@djangocfg/seo 2.1.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/README.md +192 -0
  2. package/dist/cli.d.ts +1 -0
  3. package/dist/cli.mjs +3780 -0
  4. package/dist/cli.mjs.map +1 -0
  5. package/dist/crawler/index.d.ts +88 -0
  6. package/dist/crawler/index.mjs +610 -0
  7. package/dist/crawler/index.mjs.map +1 -0
  8. package/dist/google-console/index.d.ts +95 -0
  9. package/dist/google-console/index.mjs +539 -0
  10. package/dist/google-console/index.mjs.map +1 -0
  11. package/dist/index.d.ts +285 -0
  12. package/dist/index.mjs +3236 -0
  13. package/dist/index.mjs.map +1 -0
  14. package/dist/link-checker/index.d.ts +76 -0
  15. package/dist/link-checker/index.mjs +326 -0
  16. package/dist/link-checker/index.mjs.map +1 -0
  17. package/dist/markdown-report-B3QdDzxE.d.ts +193 -0
  18. package/dist/reports/index.d.ts +24 -0
  19. package/dist/reports/index.mjs +836 -0
  20. package/dist/reports/index.mjs.map +1 -0
  21. package/dist/routes/index.d.ts +69 -0
  22. package/dist/routes/index.mjs +372 -0
  23. package/dist/routes/index.mjs.map +1 -0
  24. package/dist/scanner-Cz4Th2Pt.d.ts +60 -0
  25. package/dist/types/index.d.ts +144 -0
  26. package/dist/types/index.mjs +3 -0
  27. package/dist/types/index.mjs.map +1 -0
  28. package/package.json +114 -0
  29. package/src/analyzer.ts +256 -0
  30. package/src/cli/commands/audit.ts +260 -0
  31. package/src/cli/commands/content.ts +180 -0
  32. package/src/cli/commands/crawl.ts +32 -0
  33. package/src/cli/commands/index.ts +12 -0
  34. package/src/cli/commands/inspect.ts +60 -0
  35. package/src/cli/commands/links.ts +41 -0
  36. package/src/cli/commands/robots.ts +36 -0
  37. package/src/cli/commands/routes.ts +126 -0
  38. package/src/cli/commands/sitemap.ts +48 -0
  39. package/src/cli/index.ts +149 -0
  40. package/src/cli/types.ts +40 -0
  41. package/src/config.ts +207 -0
  42. package/src/content/index.ts +51 -0
  43. package/src/content/link-checker.ts +182 -0
  44. package/src/content/link-fixer.ts +188 -0
  45. package/src/content/scanner.ts +200 -0
  46. package/src/content/sitemap-generator.ts +321 -0
  47. package/src/content/types.ts +140 -0
  48. package/src/crawler/crawler.ts +425 -0
  49. package/src/crawler/index.ts +10 -0
  50. package/src/crawler/robots-parser.ts +171 -0
  51. package/src/crawler/sitemap-validator.ts +204 -0
  52. package/src/google-console/analyzer.ts +317 -0
  53. package/src/google-console/auth.ts +100 -0
  54. package/src/google-console/client.ts +281 -0
  55. package/src/google-console/index.ts +9 -0
  56. package/src/index.ts +144 -0
  57. package/src/link-checker/index.ts +461 -0
  58. package/src/reports/claude-context.ts +149 -0
  59. package/src/reports/generator.ts +244 -0
  60. package/src/reports/index.ts +27 -0
  61. package/src/reports/json-report.ts +320 -0
  62. package/src/reports/markdown-report.ts +246 -0
  63. package/src/reports/split-report.ts +252 -0
  64. package/src/routes/analyzer.ts +324 -0
  65. package/src/routes/index.ts +25 -0
  66. package/src/routes/scanner.ts +298 -0
  67. package/src/types/index.ts +222 -0
  68. package/src/utils/index.ts +154 -0
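Judging by the dist/ and src/ layout above, the package ships a main entry plus per-feature modules (crawler, google-console, link-checker, reports, routes, types) and a CLI bundle. A minimal import sketch, assuming package/package.json (114 lines, contents not shown in this section) exposes those directories as subpath exports; the paths below are an assumption, not confirmed by this diff:

// Hypothetical subpath imports; the actual export map is defined in package/package.json.
import { SiteCrawler, analyzeCrawlResults, analyzeRobotsTxt } from '@djangocfg/seo/crawler';
import type { CrawlerConfig, CrawlResult, SeoIssue } from '@djangocfg/seo/types';

// Placeholder site URL; maxPages is one of the CrawlerConfig options shown in crawler.ts below.
const crawler = new SiteCrawler('https://example.com', { maxPages: 25 });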

package/src/crawler/crawler.ts
@@ -0,0 +1,425 @@
+ /**
+  * @djangocfg/seo - Site Crawler
+  * Internal site crawler for SEO analysis
+  */
+
+ import { load } from 'cheerio';
+ import pLimit from 'p-limit';
+ import consola from 'consola';
+ import type { CrawlResult, CrawlerConfig, SeoIssue } from '../types/index.js';
+
+ const DEFAULT_CONFIG: Required<CrawlerConfig> = {
+   maxPages: 100,
+   maxDepth: 3,
+   concurrency: 5,
+   timeout: 30000,
+   userAgent: 'DjangoCFG-SEO-Crawler/1.0 (+https://djangocfg.com/bot)',
+   respectRobotsTxt: true,
+   includePatterns: [],
+   excludePatterns: [
+     '/api/',
+     '/admin/',
+     '/_next/',
+     '/static/',
+     '.pdf',
+     '.jpg',
+     '.png',
+     '.gif',
+     '.svg',
+     '.css',
+     '.js',
+   ],
+ };
+
+ export class SiteCrawler {
+   private config: Required<CrawlerConfig>;
+   private baseUrl: URL;
+   private visited = new Set<string>();
+   private queue: Array<{ url: string; depth: number }> = [];
+   private results: CrawlResult[] = [];
+   private limit: ReturnType<typeof pLimit>;
+
+   constructor(siteUrl: string, config?: CrawlerConfig) {
+     this.config = { ...DEFAULT_CONFIG, ...config };
+     this.baseUrl = new URL(siteUrl);
+     this.limit = pLimit(this.config.concurrency);
+   }
+
+   /**
+    * Start crawling the site
+    */
+   async crawl(): Promise<CrawlResult[]> {
+     consola.info(`Starting crawl of ${this.baseUrl.origin}`);
+     consola.info(`Config: maxPages=${this.config.maxPages}, maxDepth=${this.config.maxDepth}`);
+
+     this.queue.push({ url: this.baseUrl.href, depth: 0 });
+
+     while (this.queue.length > 0 && this.results.length < this.config.maxPages) {
+       const batch = this.queue.splice(0, this.config.concurrency);
+
+       const promises = batch.map(({ url, depth }) =>
+         this.limit(() => this.crawlPage(url, depth))
+       );
+
+       await Promise.all(promises);
+     }
+
+     consola.success(`Crawl complete. Crawled ${this.results.length} pages.`);
+     return this.results;
+   }
+
+   /**
+    * Crawl a single page
+    */
+   private async crawlPage(url: string, depth: number): Promise<void> {
+     const normalizedUrl = this.normalizeUrl(url);
+
+     if (this.visited.has(normalizedUrl)) return;
+     if (this.shouldExclude(normalizedUrl)) return;
+
+     this.visited.add(normalizedUrl);
+
+     const startTime = Date.now();
+     const result: CrawlResult = {
+       url: normalizedUrl,
+       statusCode: 0,
+       links: { internal: [], external: [] },
+       images: [],
+       loadTime: 0,
+       errors: [],
+       warnings: [],
+       crawledAt: new Date().toISOString(),
+     };
+
+     try {
+       const controller = new AbortController();
+       const timeoutId = setTimeout(() => controller.abort(), this.config.timeout);
+
+       const response = await fetch(normalizedUrl, {
+         headers: {
+           'User-Agent': this.config.userAgent,
+           Accept: 'text/html,application/xhtml+xml',
+         },
+         signal: controller.signal,
+         redirect: 'follow',
+       });
+
+       // TTFB = time from request start to first response headers
+       result.ttfb = Date.now() - startTime;
+
+       clearTimeout(timeoutId);
+
+       result.statusCode = response.status;
+       result.contentType = response.headers.get('content-type') || undefined;
+       result.contentLength = Number(response.headers.get('content-length')) || undefined;
+
+       if (response.ok && result.contentType?.includes('text/html')) {
+         const html = await response.text();
+         this.parseHtml(html, result, normalizedUrl, depth);
+       } else if (!response.ok) {
+         result.errors.push(`HTTP ${response.status}: ${response.statusText}`);
+       }
+     } catch (error) {
+       if (error instanceof Error) {
+         if (error.name === 'AbortError') {
+           result.errors.push('Request timeout');
+         } else {
+           result.errors.push(error.message);
+         }
+       }
+     }
+
+     result.loadTime = Date.now() - startTime;
+     this.results.push(result);
+
+     consola.debug(`Crawled: ${normalizedUrl} (${result.statusCode}) - ${result.loadTime}ms`);
+   }
+
+   /**
+    * Parse HTML and extract SEO-relevant data
+    */
+   private parseHtml(html: string, result: CrawlResult, pageUrl: string, depth: number): void {
+     const $ = load(html);
+
+     // Title
+     result.title = $('title').first().text().trim() || undefined;
+     if (!result.title) {
+       result.warnings.push('Missing title tag');
+     } else if (result.title.length > 60) {
+       result.warnings.push(`Title too long (${result.title.length} chars, recommended: <60)`);
+     }
+
+     // Meta description
+     result.metaDescription =
+       $('meta[name="description"]').attr('content')?.trim() || undefined;
+     if (!result.metaDescription) {
+       result.warnings.push('Missing meta description');
+     } else if (result.metaDescription.length > 160) {
+       result.warnings.push(
+         `Meta description too long (${result.metaDescription.length} chars, recommended: <160)`
+       );
+     }
+
+     // Meta robots
+     result.metaRobots = $('meta[name="robots"]').attr('content')?.trim() || undefined;
+     const xRobots = $('meta[http-equiv="X-Robots-Tag"]').attr('content')?.trim();
+     if (xRobots) {
+       result.metaRobots = result.metaRobots ? `${result.metaRobots}, ${xRobots}` : xRobots;
+     }
+
+     // Canonical
+     result.canonicalUrl = $('link[rel="canonical"]').attr('href')?.trim() || undefined;
+     if (!result.canonicalUrl) {
+       result.warnings.push('Missing canonical tag');
+     }
+
+     // Headings
+     result.h1 = $('h1')
+       .map((_, el) => $(el).text().trim())
+       .get();
+     result.h2 = $('h2')
+       .map((_, el) => $(el).text().trim())
+       .get();
+
+     if (result.h1.length === 0) {
+       result.warnings.push('Missing H1 tag');
+     } else if (result.h1.length > 1) {
+       result.warnings.push(`Multiple H1 tags (${result.h1.length})`);
+     }
+
+     // Links
+     $('a[href]').each((_, el) => {
+       const href = $(el).attr('href');
+       if (!href) return;
+
+       try {
+         const linkUrl = new URL(href, pageUrl);
+
+         if (linkUrl.hostname === this.baseUrl.hostname) {
+           const internalUrl = this.normalizeUrl(linkUrl.href);
+           result.links.internal.push(internalUrl);
+
+           // Add to crawl queue
+           if (depth < this.config.maxDepth && !this.visited.has(internalUrl)) {
+             this.queue.push({ url: internalUrl, depth: depth + 1 });
+           }
+         } else {
+           result.links.external.push(linkUrl.href);
+         }
+       } catch {
+         // Invalid URL, skip
+       }
+     });
+
+     // Images
+     $('img').each((_, el) => {
+       const src = $(el).attr('src');
+       const alt = $(el).attr('alt');
+
+       if (src) {
+         result.images.push({
+           src,
+           alt,
+           hasAlt: alt !== undefined && alt.trim().length > 0,
+         });
+       }
+     });
+
+     const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
+     if (imagesWithoutAlt.length > 0) {
+       result.warnings.push(`${imagesWithoutAlt.length} images without alt text`);
+     }
+   }
+
+   /**
+    * Normalize URL for deduplication
+    */
+   private normalizeUrl(url: string): string {
+     try {
+       const parsed = new URL(url, this.baseUrl.href);
+       // Drop the hash and any trailing slash so equivalent URLs dedupe to one key
+       parsed.hash = '';
+       let pathname = parsed.pathname;
+       if (pathname.endsWith('/') && pathname !== '/') {
+         pathname = pathname.slice(0, -1);
+       }
+       parsed.pathname = pathname;
+       return parsed.href;
+     } catch {
+       return url;
+     }
+   }
+
+   /**
+    * Check if URL should be excluded
+    */
+   private shouldExclude(url: string): boolean {
+     // Check include patterns first
+     if (this.config.includePatterns.length > 0) {
+       const included = this.config.includePatterns.some((pattern) =>
+         url.includes(pattern)
+       );
+       if (!included) return true;
+     }
+
+     // Check exclude patterns
+     return this.config.excludePatterns.some((pattern) => url.includes(pattern));
+   }
+ }
+
+ /**
+  * Analyze crawl results for SEO issues
+  */
+ export function analyzeCrawlResults(results: CrawlResult[]): SeoIssue[] {
+   const issues: SeoIssue[] = [];
+
+   for (const result of results) {
+     // HTTP errors
+     if (result.statusCode >= 400) {
+       issues.push({
+         id: `http-error-${hash(result.url)}`,
+         url: result.url,
+         category: 'technical',
+         severity: result.statusCode >= 500 ? 'critical' : 'error',
+         title: `HTTP ${result.statusCode} error`,
+         description: `Page returns ${result.statusCode} status code.`,
+         recommendation:
+           result.statusCode === 404
+             ? 'Either restore the content or set up a redirect.'
+             : 'Fix the server error and ensure the page is accessible.',
+         detectedAt: result.crawledAt,
+         metadata: { statusCode: result.statusCode },
+       });
+     }
+
+     // Missing title
+     if (!result.title && result.statusCode === 200) {
+       issues.push({
+         id: `missing-title-${hash(result.url)}`,
+         url: result.url,
+         category: 'content',
+         severity: 'error',
+         title: 'Missing title tag',
+         description: 'This page does not have a title tag.',
+         recommendation: 'Add a unique, descriptive title tag (50-60 characters).',
+         detectedAt: result.crawledAt,
+       });
+     }
+
+     // Missing meta description
+     if (!result.metaDescription && result.statusCode === 200) {
+       issues.push({
+         id: `missing-meta-desc-${hash(result.url)}`,
+         url: result.url,
+         category: 'content',
+         severity: 'warning',
+         title: 'Missing meta description',
+         description: 'This page does not have a meta description.',
+         recommendation: 'Add a unique meta description (120-160 characters).',
+         detectedAt: result.crawledAt,
+       });
+     }
+
+     // Missing H1
+     if (result.h1 && result.h1.length === 0 && result.statusCode === 200) {
+       issues.push({
+         id: `missing-h1-${hash(result.url)}`,
+         url: result.url,
+         category: 'content',
+         severity: 'warning',
+         title: 'Missing H1 heading',
+         description: 'This page does not have an H1 heading.',
+         recommendation: 'Add a single H1 heading that describes the page content.',
+         detectedAt: result.crawledAt,
+       });
+     }
+
+     // Multiple H1s
+     if (result.h1 && result.h1.length > 1) {
+       issues.push({
+         id: `multiple-h1-${hash(result.url)}`,
+         url: result.url,
+         category: 'content',
+         severity: 'warning',
+         title: 'Multiple H1 headings',
+         description: `This page has ${result.h1.length} H1 headings.`,
+         recommendation: 'Use only one H1 heading per page.',
+         detectedAt: result.crawledAt,
+         metadata: { h1Count: result.h1.length },
+       });
+     }
+
+     // Images without alt
+     const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
+     if (imagesWithoutAlt.length > 0) {
+       issues.push({
+         id: `images-no-alt-${hash(result.url)}`,
+         url: result.url,
+         category: 'content',
+         severity: 'info',
+         title: 'Images without alt text',
+         description: `${imagesWithoutAlt.length} images are missing alt text.`,
+         recommendation: 'Add descriptive alt text to all images for accessibility and SEO.',
+         detectedAt: result.crawledAt,
+         metadata: { count: imagesWithoutAlt.length },
+       });
+     }
+
+     // Slow load time (> 3s)
+     if (result.loadTime > 3000) {
+       issues.push({
+         id: `slow-page-${hash(result.url)}`,
+         url: result.url,
+         category: 'performance',
+         severity: result.loadTime > 5000 ? 'error' : 'warning',
+         title: 'Slow page load time',
+         description: `Page took ${result.loadTime}ms to load.`,
+         recommendation: 'Optimize page load time. Target under 3 seconds.',
+         detectedAt: result.crawledAt,
+         metadata: { loadTime: result.loadTime },
+       });
+     }
+
+     // Slow TTFB (> 800ms)
+     if (result.ttfb && result.ttfb > 800) {
+       issues.push({
+         id: `slow-ttfb-${hash(result.url)}`,
+         url: result.url,
+         category: 'performance',
+         severity: result.ttfb > 1500 ? 'error' : 'warning',
+         title: 'Slow Time to First Byte',
+         description: `TTFB is ${result.ttfb}ms. Server responded slowly.`,
+         recommendation: 'Optimize server response. Target TTFB under 800ms. Consider CDN, caching, or server upgrades.',
+         detectedAt: result.crawledAt,
+         metadata: { ttfb: result.ttfb },
+       });
+     }
+
+     // Noindex check
+     if (result.metaRobots?.includes('noindex')) {
+       issues.push({
+         id: `noindex-${hash(result.url)}`,
+         url: result.url,
+         category: 'indexing',
+         severity: 'info',
+         title: 'Page marked as noindex',
+         description: 'This page has a noindex directive.',
+         recommendation: 'Verify this is intentional. Remove noindex if the page should be indexed.',
+         detectedAt: result.crawledAt,
+         metadata: { metaRobots: result.metaRobots },
+       });
+     }
+   }
+
+   return issues;
+ }
+
+ function hash(str: string): string {
+   let hash = 0;
+   for (let i = 0; i < str.length; i++) {
+     const char = str.charCodeAt(i);
+     hash = (hash << 5) - hash + char;
+     hash = hash & hash;
+   }
+   return Math.abs(hash).toString(36);
+ }
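
For orientation, a short usage sketch of the two exports above, based only on the signatures visible in this file; the relative import mirrors how the sibling crawler/index.ts (next hunk) references it, and the wrapper function name is illustrative:

// Sketch: crawl a site, then derive SEO issues from the results.
import consola from 'consola';
import { SiteCrawler, analyzeCrawlResults } from './crawler.js';

export async function auditSite(siteUrl: string) {
  const crawler = new SiteCrawler(siteUrl, { maxPages: 50, maxDepth: 2 });
  const results = await crawler.crawl();        // CrawlResult[] for every page visited
  const issues = analyzeCrawlResults(results);  // SeoIssue[] derived from those results

  // Severity values used above: 'critical' | 'error' | 'warning' | 'info'
  const blocking = issues.filter((i) => i.severity === 'critical' || i.severity === 'error');
  consola.info(`${results.length} pages crawled, ${issues.length} issues (${blocking.length} blocking)`);
  return { results, issues };
}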

package/src/crawler/index.ts
@@ -0,0 +1,10 @@
+ /**
+  * @djangocfg/seo - Crawler Module
+  * Site crawler and analysis tools
+  */
+
+ export { SiteCrawler, analyzeCrawlResults } from './crawler.js';
+ export { analyzeRobotsTxt, isUrlAllowed } from './robots-parser.js';
+ export { analyzeSitemap, analyzeAllSitemaps } from './sitemap-validator.js';
+ export type { RobotsAnalysis } from './robots-parser.js';
+ export type { SitemapAnalysis } from './sitemap-validator.js';
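
The barrel file above also makes it easy to combine the robots.txt analysis with the crawler. crawler.ts declares respectRobotsTxt in its defaults, but no enforcement of it is visible in that file, so a caller might wire it up manually; the sketch below is illustrative only and not taken from this package's CLI:

// Illustrative wiring: respect robots.txt Disallow rules by turning them into excludePatterns.
// Passing excludePatterns replaces the crawler's built-in defaults because of the config spread,
// so a couple of the defaults are re-listed here.
import { SiteCrawler, analyzeRobotsTxt } from './index.js';

export async function crawlRespectingRobots(siteUrl: string) {
  const robots = await analyzeRobotsTxt(siteUrl);
  const crawler = new SiteCrawler(siteUrl, {
    // shouldExclude() uses substring matching, so plain Disallow paths work;
    // wildcard rules (e.g. "/*.json$") would need extra handling.
    excludePatterns: ['/api/', '/admin/', ...robots.disallowedPaths],
  });
  return crawler.crawl();
}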

package/src/crawler/robots-parser.ts
@@ -0,0 +1,171 @@
+ /**
+  * @djangocfg/seo - Robots.txt Parser
+  * Parse and analyze robots.txt files
+  */
+
+ import robotsParser from 'robots-parser';
+ import consola from 'consola';
+ import type { SeoIssue } from '../types/index.js';
+
+ export interface RobotsAnalysis {
+   exists: boolean;
+   content?: string;
+   sitemaps: string[];
+   allowedPaths: string[];
+   disallowedPaths: string[];
+   crawlDelay?: number;
+   issues: SeoIssue[];
+ }
+
+ /**
+  * Fetch and parse robots.txt for a site
+  */
+ export async function analyzeRobotsTxt(siteUrl: string): Promise<RobotsAnalysis> {
+   const robotsUrl = new URL('/robots.txt', siteUrl).href;
+
+   const analysis: RobotsAnalysis = {
+     exists: false,
+     sitemaps: [],
+     allowedPaths: [],
+     disallowedPaths: [],
+     issues: [],
+   };
+
+   try {
+     const response = await fetch(robotsUrl);
+
+     if (!response.ok) {
+       analysis.issues.push({
+         id: 'missing-robots-txt',
+         url: robotsUrl,
+         category: 'technical',
+         severity: 'warning',
+         title: 'Missing robots.txt',
+         description: `No robots.txt file found (HTTP ${response.status}).`,
+         recommendation: 'Create a robots.txt file to control crawler access.',
+         detectedAt: new Date().toISOString(),
+       });
+       return analysis;
+     }
+
+     analysis.exists = true;
+     analysis.content = await response.text();
+
+     // Parse robots.txt
+     const robots = robotsParser(robotsUrl, analysis.content);
+
+     // Extract sitemaps
+     analysis.sitemaps = robots.getSitemaps();
+
+     if (analysis.sitemaps.length === 0) {
+       analysis.issues.push({
+         id: 'no-sitemap-in-robots',
+         url: robotsUrl,
+         category: 'technical',
+         severity: 'info',
+         title: 'No sitemap in robots.txt',
+         description: 'No sitemap URL is declared in robots.txt.',
+         recommendation: 'Add a Sitemap directive pointing to your XML sitemap.',
+         detectedAt: new Date().toISOString(),
+       });
+     }
+
+     // Parse rules (simplified extraction)
+     const lines = analysis.content.split('\n');
+     let currentUserAgent = '*';
+
+     for (const line of lines) {
+       const trimmed = line.trim().toLowerCase();
+
+       if (trimmed.startsWith('user-agent:')) {
+         currentUserAgent = trimmed.replace('user-agent:', '').trim();
+       } else if (trimmed.startsWith('disallow:')) {
+         const path = line.trim().replace(/disallow:/i, '').trim();
+         if (path) {
+           analysis.disallowedPaths.push(path);
+         }
+       } else if (trimmed.startsWith('allow:')) {
+         const path = line.trim().replace(/allow:/i, '').trim();
+         if (path) {
+           analysis.allowedPaths.push(path);
+         }
+       } else if (trimmed.startsWith('crawl-delay:')) {
+         const delay = parseInt(trimmed.replace('crawl-delay:', '').trim(), 10);
+         if (!isNaN(delay)) {
+           analysis.crawlDelay = delay;
+         }
+       }
+     }
+
+     // Check for blocking important paths
+     const importantPaths = ['/', '/sitemap.xml'];
+     for (const path of importantPaths) {
+       if (!robots.isAllowed(new URL(path, siteUrl).href, 'Googlebot')) {
+         analysis.issues.push({
+           id: `blocked-important-path-${path.replace(/\//g, '-')}`,
+           url: siteUrl,
+           category: 'crawling',
+           severity: 'error',
+           title: `Important path blocked: ${path}`,
+           description: `The path ${path} is blocked in robots.txt.`,
+           recommendation: `Ensure ${path} is accessible to search engines.`,
+           detectedAt: new Date().toISOString(),
+           metadata: { path },
+         });
+       }
+     }
+
+     // Check for excessively restrictive rules
+     if (analysis.disallowedPaths.includes('/')) {
+       analysis.issues.push({
+         id: 'all-blocked',
+         url: robotsUrl,
+         category: 'crawling',
+         severity: 'critical',
+         title: 'Entire site blocked',
+         description: 'robots.txt blocks access to the entire site (Disallow: /).',
+         recommendation: 'Remove or modify this rule if you want your site to be indexed.',
+         detectedAt: new Date().toISOString(),
+       });
+     }
+
+     consola.debug(`Analyzed robots.txt: ${analysis.disallowedPaths.length} disallow rules`);
+   } catch (error) {
+     consola.error('Failed to fetch robots.txt:', error);
+     analysis.issues.push({
+       id: 'robots-txt-error',
+       url: robotsUrl,
+       category: 'technical',
+       severity: 'warning',
+       title: 'Failed to fetch robots.txt',
+       description: `Error fetching robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}`,
+       recommendation: 'Ensure robots.txt is accessible.',
+       detectedAt: new Date().toISOString(),
+     });
+   }
+
+   return analysis;
+ }
+
+ /**
+  * Check if a URL is allowed by robots.txt
+  */
+ export async function isUrlAllowed(
+   siteUrl: string,
+   url: string,
+   userAgent = 'Googlebot'
+ ): Promise<boolean> {
+   const robotsUrl = new URL('/robots.txt', siteUrl).href;
+
+   try {
+     const response = await fetch(robotsUrl);
+     if (!response.ok) return true; // No robots.txt = allow all
+
+     const content = await response.text();
+     const robots = robotsParser(robotsUrl, content);
+
+     return robots.isAllowed(url, userAgent) ?? true;
+   } catch {
+     return true; // Error fetching = allow
+   }
+ }
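
Finally, a hedged usage sketch for the two robots.txt helpers above; the wrapper name and the /pricing path are placeholders, not part of the package:

import consola from 'consola';
import { analyzeRobotsTxt, isUrlAllowed } from './robots-parser.js';

export async function checkRobots(siteUrl: string) {
  const analysis = await analyzeRobotsTxt(siteUrl);
  consola.info(`robots.txt found: ${analysis.exists}, sitemaps declared: ${analysis.sitemaps.length}`);

  // Each issue carries the severity/title/recommendation fields used throughout this package.
  for (const issue of analysis.issues) {
    consola.warn(`[${issue.severity}] ${issue.title}: ${issue.recommendation}`);
  }

  // Spot-check a single URL against the live robots.txt (user agent defaults to 'Googlebot').
  const allowed = await isUrlAllowed(siteUrl, new URL('/pricing', siteUrl).href);
  consola.info(`/pricing allowed: ${allowed}`);
}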