@djangocfg/seo 2.1.50

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (68)
  1. package/README.md +192 -0
  2. package/dist/cli.d.ts +1 -0
  3. package/dist/cli.mjs +3780 -0
  4. package/dist/cli.mjs.map +1 -0
  5. package/dist/crawler/index.d.ts +88 -0
  6. package/dist/crawler/index.mjs +610 -0
  7. package/dist/crawler/index.mjs.map +1 -0
  8. package/dist/google-console/index.d.ts +95 -0
  9. package/dist/google-console/index.mjs +539 -0
  10. package/dist/google-console/index.mjs.map +1 -0
  11. package/dist/index.d.ts +285 -0
  12. package/dist/index.mjs +3236 -0
  13. package/dist/index.mjs.map +1 -0
  14. package/dist/link-checker/index.d.ts +76 -0
  15. package/dist/link-checker/index.mjs +326 -0
  16. package/dist/link-checker/index.mjs.map +1 -0
  17. package/dist/markdown-report-B3QdDzxE.d.ts +193 -0
  18. package/dist/reports/index.d.ts +24 -0
  19. package/dist/reports/index.mjs +836 -0
  20. package/dist/reports/index.mjs.map +1 -0
  21. package/dist/routes/index.d.ts +69 -0
  22. package/dist/routes/index.mjs +372 -0
  23. package/dist/routes/index.mjs.map +1 -0
  24. package/dist/scanner-Cz4Th2Pt.d.ts +60 -0
  25. package/dist/types/index.d.ts +144 -0
  26. package/dist/types/index.mjs +3 -0
  27. package/dist/types/index.mjs.map +1 -0
  28. package/package.json +114 -0
  29. package/src/analyzer.ts +256 -0
  30. package/src/cli/commands/audit.ts +260 -0
  31. package/src/cli/commands/content.ts +180 -0
  32. package/src/cli/commands/crawl.ts +32 -0
  33. package/src/cli/commands/index.ts +12 -0
  34. package/src/cli/commands/inspect.ts +60 -0
  35. package/src/cli/commands/links.ts +41 -0
  36. package/src/cli/commands/robots.ts +36 -0
  37. package/src/cli/commands/routes.ts +126 -0
  38. package/src/cli/commands/sitemap.ts +48 -0
  39. package/src/cli/index.ts +149 -0
  40. package/src/cli/types.ts +40 -0
  41. package/src/config.ts +207 -0
  42. package/src/content/index.ts +51 -0
  43. package/src/content/link-checker.ts +182 -0
  44. package/src/content/link-fixer.ts +188 -0
  45. package/src/content/scanner.ts +200 -0
  46. package/src/content/sitemap-generator.ts +321 -0
  47. package/src/content/types.ts +140 -0
  48. package/src/crawler/crawler.ts +425 -0
  49. package/src/crawler/index.ts +10 -0
  50. package/src/crawler/robots-parser.ts +171 -0
  51. package/src/crawler/sitemap-validator.ts +204 -0
  52. package/src/google-console/analyzer.ts +317 -0
  53. package/src/google-console/auth.ts +100 -0
  54. package/src/google-console/client.ts +281 -0
  55. package/src/google-console/index.ts +9 -0
  56. package/src/index.ts +144 -0
  57. package/src/link-checker/index.ts +461 -0
  58. package/src/reports/claude-context.ts +149 -0
  59. package/src/reports/generator.ts +244 -0
  60. package/src/reports/index.ts +27 -0
  61. package/src/reports/json-report.ts +320 -0
  62. package/src/reports/markdown-report.ts +246 -0
  63. package/src/reports/split-report.ts +252 -0
  64. package/src/routes/analyzer.ts +324 -0
  65. package/src/routes/index.ts +25 -0
  66. package/src/routes/scanner.ts +298 -0
  67. package/src/types/index.ts +222 -0
  68. package/src/utils/index.ts +154 -0
package/dist/crawler/index.d.ts
@@ -0,0 +1,88 @@
+ import { CrawlerConfig, CrawlResult, SeoIssue } from '../types/index.js';
+
+ /**
+  * @djangocfg/seo - Site Crawler
+  * Internal site crawler for SEO analysis
+  */
+
+ declare class SiteCrawler {
+   private config;
+   private baseUrl;
+   private visited;
+   private queue;
+   private results;
+   private limit;
+   constructor(siteUrl: string, config?: CrawlerConfig);
+   /**
+    * Start crawling the site
+    */
+   crawl(): Promise<CrawlResult[]>;
+   /**
+    * Crawl a single page
+    */
+   private crawlPage;
+   /**
+    * Parse HTML and extract SEO-relevant data
+    */
+   private parseHtml;
+   /**
+    * Normalize URL for deduplication
+    */
+   private normalizeUrl;
+   /**
+    * Check if URL should be excluded
+    */
+   private shouldExclude;
+ }
+ /**
+  * Analyze crawl results for SEO issues
+  */
+ declare function analyzeCrawlResults(results: CrawlResult[]): SeoIssue[];
+
+ /**
+  * @djangocfg/seo - Robots.txt Parser
+  * Parse and analyze robots.txt files
+  */
+
+ interface RobotsAnalysis {
+   exists: boolean;
+   content?: string;
+   sitemaps: string[];
+   allowedPaths: string[];
+   disallowedPaths: string[];
+   crawlDelay?: number;
+   issues: SeoIssue[];
+ }
+ /**
+  * Fetch and parse robots.txt for a site
+  */
+ declare function analyzeRobotsTxt(siteUrl: string): Promise<RobotsAnalysis>;
+ /**
+  * Check if a URL is allowed by robots.txt
+  */
+ declare function isUrlAllowed(siteUrl: string, url: string, userAgent?: string): Promise<boolean>;
+
+ /**
+  * @djangocfg/seo - Sitemap Validator
+  * Validate XML sitemaps
+  */
+
+ interface SitemapAnalysis {
+   url: string;
+   exists: boolean;
+   type: 'sitemap' | 'sitemap-index' | 'unknown';
+   urls: string[];
+   childSitemaps: string[];
+   lastmod?: string;
+   issues: SeoIssue[];
+ }
+ /**
+  * Analyze a sitemap URL
+  */
+ declare function analyzeSitemap(sitemapUrl: string): Promise<SitemapAnalysis>;
+ /**
+  * Recursively analyze a sitemap and all its children
+  */
+ declare function analyzeAllSitemaps(sitemapUrl: string, maxDepth?: number): Promise<SitemapAnalysis[]>;
+
+ export { type RobotsAnalysis, SiteCrawler, type SitemapAnalysis, analyzeAllSitemaps, analyzeCrawlResults, analyzeRobotsTxt, analyzeSitemap, isUrlAllowed };
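
Note: taken together, these declarations describe the crawler's public surface. A minimal usage sketch, assuming the dist/crawler bundle is exposed as a `@djangocfg/seo/crawler` subpath export (not verified from this diff) and using a placeholder site URL:

import { SiteCrawler, analyzeCrawlResults } from '@djangocfg/seo/crawler';

// Placeholder target and limits; all CrawlerConfig fields are optional and
// fall back to the DEFAULT_CONFIG in the implementation below
// (maxPages: 100, maxDepth: 3, concurrency: 5).
const crawler = new SiteCrawler('https://example.com', { maxPages: 50, maxDepth: 2 });
const results = await crawler.crawl();

// Map raw crawl results to categorized SeoIssue records.
const issues = analyzeCrawlResults(results);
for (const issue of issues) {
  console.log(`[${issue.severity}] ${issue.title} -> ${issue.url}`);
}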
package/dist/crawler/index.mjs
@@ -0,0 +1,610 @@
+ import { load } from 'cheerio';
+ import pLimit from 'p-limit';
+ import consola from 'consola';
+ import robotsParser from 'robots-parser';
+
+ // src/crawler/crawler.ts
+ var DEFAULT_CONFIG = {
+   maxPages: 100,
+   maxDepth: 3,
+   concurrency: 5,
+   timeout: 3e4,
+   userAgent: "DjangoCFG-SEO-Crawler/1.0 (+https://djangocfg.com/bot)",
+   respectRobotsTxt: true,
+   includePatterns: [],
+   excludePatterns: [
+     "/api/",
+     "/admin/",
+     "/_next/",
+     "/static/",
+     ".pdf",
+     ".jpg",
+     ".png",
+     ".gif",
+     ".svg",
+     ".css",
+     ".js"
+   ]
+ };
+ var SiteCrawler = class {
+   config;
+   baseUrl;
+   visited = /* @__PURE__ */ new Set();
+   queue = [];
+   results = [];
+   limit;
+   constructor(siteUrl, config) {
+     this.config = { ...DEFAULT_CONFIG, ...config };
+     this.baseUrl = new URL(siteUrl);
+     this.limit = pLimit(this.config.concurrency);
+   }
+   /**
+    * Start crawling the site
+    */
+   async crawl() {
+     consola.info(`Starting crawl of ${this.baseUrl.origin}`);
+     consola.info(`Config: maxPages=${this.config.maxPages}, maxDepth=${this.config.maxDepth}`);
+     this.queue.push({ url: this.baseUrl.href, depth: 0 });
+     while (this.queue.length > 0 && this.results.length < this.config.maxPages) {
+       const batch = this.queue.splice(0, this.config.concurrency);
+       const promises = batch.map(
+         ({ url, depth }) => this.limit(() => this.crawlPage(url, depth))
+       );
+       await Promise.all(promises);
+     }
+     consola.success(`Crawl complete. Crawled ${this.results.length} pages.`);
+     return this.results;
+   }
+   /**
+    * Crawl a single page
+    */
+   async crawlPage(url, depth) {
+     const normalizedUrl = this.normalizeUrl(url);
+     if (this.visited.has(normalizedUrl)) return;
+     if (this.shouldExclude(normalizedUrl)) return;
+     this.visited.add(normalizedUrl);
+     const startTime = Date.now();
+     const result = {
+       url: normalizedUrl,
+       statusCode: 0,
+       links: { internal: [], external: [] },
+       images: [],
+       loadTime: 0,
+       errors: [],
+       warnings: [],
+       crawledAt: (/* @__PURE__ */ new Date()).toISOString()
+     };
+     try {
+       const controller = new AbortController();
+       const timeoutId = setTimeout(() => controller.abort(), this.config.timeout);
+       const response = await fetch(normalizedUrl, {
+         headers: {
+           "User-Agent": this.config.userAgent,
+           Accept: "text/html,application/xhtml+xml"
+         },
+         signal: controller.signal,
+         redirect: "follow"
+       });
+       result.ttfb = Date.now() - startTime;
+       clearTimeout(timeoutId);
+       result.statusCode = response.status;
+       result.contentType = response.headers.get("content-type") || void 0;
+       result.contentLength = Number(response.headers.get("content-length")) || void 0;
+       if (response.ok && result.contentType?.includes("text/html")) {
+         const html = await response.text();
+         this.parseHtml(html, result, normalizedUrl, depth);
+       } else if (!response.ok) {
+         result.errors.push(`HTTP ${response.status}: ${response.statusText}`);
+       }
+     } catch (error) {
+       if (error instanceof Error) {
+         if (error.name === "AbortError") {
+           result.errors.push("Request timeout");
+         } else {
+           result.errors.push(error.message);
+         }
+       }
+     }
+     result.loadTime = Date.now() - startTime;
+     this.results.push(result);
+     consola.debug(`Crawled: ${normalizedUrl} (${result.statusCode}) - ${result.loadTime}ms`);
+   }
+   /**
+    * Parse HTML and extract SEO-relevant data
+    */
+   parseHtml(html, result, pageUrl, depth) {
+     const $ = load(html);
+     result.title = $("title").first().text().trim() || void 0;
+     if (!result.title) {
+       result.warnings.push("Missing title tag");
+     } else if (result.title.length > 60) {
+       result.warnings.push(`Title too long (${result.title.length} chars, recommended: <60)`);
+     }
+     result.metaDescription = $('meta[name="description"]').attr("content")?.trim() || void 0;
+     if (!result.metaDescription) {
+       result.warnings.push("Missing meta description");
+     } else if (result.metaDescription.length > 160) {
+       result.warnings.push(
+         `Meta description too long (${result.metaDescription.length} chars, recommended: <160)`
+       );
+     }
+     result.metaRobots = $('meta[name="robots"]').attr("content")?.trim() || void 0;
+     const xRobots = $('meta[http-equiv="X-Robots-Tag"]').attr("content")?.trim();
+     if (xRobots) {
+       result.metaRobots = result.metaRobots ? `${result.metaRobots}, ${xRobots}` : xRobots;
+     }
+     result.canonicalUrl = $('link[rel="canonical"]').attr("href")?.trim() || void 0;
+     if (!result.canonicalUrl) {
+       result.warnings.push("Missing canonical tag");
+     }
+     result.h1 = $("h1").map((_, el) => $(el).text().trim()).get();
+     result.h2 = $("h2").map((_, el) => $(el).text().trim()).get();
+     if (result.h1.length === 0) {
+       result.warnings.push("Missing H1 tag");
+     } else if (result.h1.length > 1) {
+       result.warnings.push(`Multiple H1 tags (${result.h1.length})`);
+     }
+     $("a[href]").each((_, el) => {
+       const href = $(el).attr("href");
+       if (!href) return;
+       try {
+         const linkUrl = new URL(href, pageUrl);
+         if (linkUrl.hostname === this.baseUrl.hostname) {
+           const internalUrl = this.normalizeUrl(linkUrl.href);
+           result.links.internal.push(internalUrl);
+           if (depth < this.config.maxDepth && !this.visited.has(internalUrl)) {
+             this.queue.push({ url: internalUrl, depth: depth + 1 });
+           }
+         } else {
+           result.links.external.push(linkUrl.href);
+         }
+       } catch {
+       }
+     });
+     $("img").each((_, el) => {
+       const src = $(el).attr("src");
+       const alt = $(el).attr("alt");
+       if (src) {
+         result.images.push({
+           src,
+           alt,
+           hasAlt: alt !== void 0 && alt.trim().length > 0
+         });
+       }
+     });
+     const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
+     if (imagesWithoutAlt.length > 0) {
+       result.warnings.push(`${imagesWithoutAlt.length} images without alt text`);
+     }
+   }
+   /**
+    * Normalize URL for deduplication
+    */
+   normalizeUrl(url) {
+     try {
+       const parsed = new URL(url, this.baseUrl.href);
+       parsed.hash = "";
+       let pathname = parsed.pathname;
+       if (pathname.endsWith("/") && pathname !== "/") {
+         pathname = pathname.slice(0, -1);
+       }
+       parsed.pathname = pathname;
+       return parsed.href;
+     } catch {
+       return url;
+     }
+   }
+   /**
+    * Check if URL should be excluded
+    */
+   shouldExclude(url) {
+     if (this.config.includePatterns.length > 0) {
+       const included = this.config.includePatterns.some(
+         (pattern) => url.includes(pattern)
+       );
+       if (!included) return true;
+     }
+     return this.config.excludePatterns.some((pattern) => url.includes(pattern));
+   }
+ };
+ function analyzeCrawlResults(results) {
+   const issues = [];
+   for (const result of results) {
+     if (result.statusCode >= 400) {
+       issues.push({
+         id: `http-error-${hash(result.url)}`,
+         url: result.url,
+         category: "technical",
+         severity: result.statusCode >= 500 ? "critical" : "error",
+         title: `HTTP ${result.statusCode} error`,
+         description: `Page returns ${result.statusCode} status code.`,
+         recommendation: result.statusCode === 404 ? "Either restore the content or set up a redirect." : "Fix the server error and ensure the page is accessible.",
+         detectedAt: result.crawledAt,
+         metadata: { statusCode: result.statusCode }
+       });
+     }
+     if (!result.title && result.statusCode === 200) {
+       issues.push({
+         id: `missing-title-${hash(result.url)}`,
+         url: result.url,
+         category: "content",
+         severity: "error",
+         title: "Missing title tag",
+         description: "This page does not have a title tag.",
+         recommendation: "Add a unique, descriptive title tag (50-60 characters).",
+         detectedAt: result.crawledAt
+       });
+     }
+     if (!result.metaDescription && result.statusCode === 200) {
+       issues.push({
+         id: `missing-meta-desc-${hash(result.url)}`,
+         url: result.url,
+         category: "content",
+         severity: "warning",
+         title: "Missing meta description",
+         description: "This page does not have a meta description.",
+         recommendation: "Add a unique meta description (120-160 characters).",
+         detectedAt: result.crawledAt
+       });
+     }
+     if (result.h1 && result.h1.length === 0 && result.statusCode === 200) {
+       issues.push({
+         id: `missing-h1-${hash(result.url)}`,
+         url: result.url,
+         category: "content",
+         severity: "warning",
+         title: "Missing H1 heading",
+         description: "This page does not have an H1 heading.",
+         recommendation: "Add a single H1 heading that describes the page content.",
+         detectedAt: result.crawledAt
+       });
+     }
+     if (result.h1 && result.h1.length > 1) {
+       issues.push({
+         id: `multiple-h1-${hash(result.url)}`,
+         url: result.url,
+         category: "content",
+         severity: "warning",
+         title: "Multiple H1 headings",
+         description: `This page has ${result.h1.length} H1 headings.`,
+         recommendation: "Use only one H1 heading per page.",
+         detectedAt: result.crawledAt,
+         metadata: { h1Count: result.h1.length }
+       });
+     }
+     const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
+     if (imagesWithoutAlt.length > 0) {
+       issues.push({
+         id: `images-no-alt-${hash(result.url)}`,
+         url: result.url,
+         category: "content",
+         severity: "info",
+         title: "Images without alt text",
+         description: `${imagesWithoutAlt.length} images are missing alt text.`,
+         recommendation: "Add descriptive alt text to all images for accessibility and SEO.",
+         detectedAt: result.crawledAt,
+         metadata: { count: imagesWithoutAlt.length }
+       });
+     }
+     if (result.loadTime > 3e3) {
+       issues.push({
+         id: `slow-page-${hash(result.url)}`,
+         url: result.url,
+         category: "performance",
+         severity: result.loadTime > 5e3 ? "error" : "warning",
+         title: "Slow page load time",
+         description: `Page took ${result.loadTime}ms to load.`,
+         recommendation: "Optimize page load time. Target under 3 seconds.",
+         detectedAt: result.crawledAt,
+         metadata: { loadTime: result.loadTime }
+       });
+     }
+     if (result.ttfb && result.ttfb > 800) {
+       issues.push({
+         id: `slow-ttfb-${hash(result.url)}`,
+         url: result.url,
+         category: "performance",
+         severity: result.ttfb > 1500 ? "error" : "warning",
+         title: "Slow Time to First Byte",
+         description: `TTFB is ${result.ttfb}ms. Server responded slowly.`,
+         recommendation: "Optimize server response. Target TTFB under 800ms. Consider CDN, caching, or server upgrades.",
+         detectedAt: result.crawledAt,
+         metadata: { ttfb: result.ttfb }
+       });
+     }
+     if (result.metaRobots?.includes("noindex")) {
+       issues.push({
+         id: `noindex-${hash(result.url)}`,
+         url: result.url,
+         category: "indexing",
+         severity: "info",
+         title: "Page marked as noindex",
+         description: "This page has a noindex directive.",
+         recommendation: "Verify this is intentional. Remove noindex if the page should be indexed.",
+         detectedAt: result.crawledAt,
+         metadata: { metaRobots: result.metaRobots }
+       });
+     }
+   }
+   return issues;
+ }
+ function hash(str) {
+   let hash3 = 0;
+   for (let i = 0; i < str.length; i++) {
+     const char = str.charCodeAt(i);
+     hash3 = (hash3 << 5) - hash3 + char;
+     hash3 = hash3 & hash3;
+   }
+   return Math.abs(hash3).toString(36);
+ }
+ async function analyzeRobotsTxt(siteUrl) {
+   const robotsUrl = new URL("/robots.txt", siteUrl).href;
+   const analysis = {
+     exists: false,
+     sitemaps: [],
+     allowedPaths: [],
+     disallowedPaths: [],
+     issues: []
+   };
+   try {
+     const response = await fetch(robotsUrl);
+     if (!response.ok) {
+       analysis.issues.push({
+         id: "missing-robots-txt",
+         url: robotsUrl,
+         category: "technical",
+         severity: "warning",
+         title: "Missing robots.txt",
+         description: `No robots.txt file found (HTTP ${response.status}).`,
+         recommendation: "Create a robots.txt file to control crawler access.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+       });
+       return analysis;
+     }
+     analysis.exists = true;
+     analysis.content = await response.text();
+     const robots = robotsParser(robotsUrl, analysis.content);
+     analysis.sitemaps = robots.getSitemaps();
+     if (analysis.sitemaps.length === 0) {
+       analysis.issues.push({
+         id: "no-sitemap-in-robots",
+         url: robotsUrl,
+         category: "technical",
+         severity: "info",
+         title: "No sitemap in robots.txt",
+         description: "No sitemap URL is declared in robots.txt.",
+         recommendation: "Add a Sitemap directive pointing to your XML sitemap.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+       });
+     }
+     const lines = analysis.content.split("\n");
+     let currentUserAgent = "*";
+     for (const line of lines) {
+       const trimmed = line.trim().toLowerCase();
+       if (trimmed.startsWith("user-agent:")) {
+         currentUserAgent = trimmed.replace("user-agent:", "").trim();
+       } else if (trimmed.startsWith("disallow:")) {
+         const path = line.trim().replace(/disallow:/i, "").trim();
+         if (path) {
+           analysis.disallowedPaths.push(path);
+         }
+       } else if (trimmed.startsWith("allow:")) {
+         const path = line.trim().replace(/allow:/i, "").trim();
+         if (path) {
+           analysis.allowedPaths.push(path);
+         }
+       } else if (trimmed.startsWith("crawl-delay:")) {
+         const delay = parseInt(trimmed.replace("crawl-delay:", "").trim(), 10);
+         if (!isNaN(delay)) {
+           analysis.crawlDelay = delay;
+         }
+       }
+     }
+     const importantPaths = ["/", "/sitemap.xml"];
+     for (const path of importantPaths) {
+       if (!robots.isAllowed(new URL(path, siteUrl).href, "Googlebot")) {
+         analysis.issues.push({
+           id: `blocked-important-path-${path.replace(/\//g, "-")}`,
+           url: siteUrl,
+           category: "crawling",
+           severity: "error",
+           title: `Important path blocked: ${path}`,
+           description: `The path ${path} is blocked in robots.txt.`,
+           recommendation: `Ensure ${path} is accessible to search engines.`,
+           detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
+           metadata: { path }
+         });
+       }
+     }
+     if (analysis.disallowedPaths.includes("/")) {
+       analysis.issues.push({
+         id: "all-blocked",
+         url: robotsUrl,
+         category: "crawling",
+         severity: "critical",
+         title: "Entire site blocked",
+         description: "robots.txt blocks access to the entire site (Disallow: /).",
+         recommendation: "Remove or modify this rule if you want your site to be indexed.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+       });
+     }
+     consola.debug(`Analyzed robots.txt: ${analysis.disallowedPaths.length} disallow rules`);
+   } catch (error) {
+     consola.error("Failed to fetch robots.txt:", error);
+     analysis.issues.push({
+       id: "robots-txt-error",
+       url: robotsUrl,
+       category: "technical",
+       severity: "warning",
+       title: "Failed to fetch robots.txt",
+       description: `Error fetching robots.txt: ${error instanceof Error ? error.message : "Unknown error"}`,
+       recommendation: "Ensure robots.txt is accessible.",
+       detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+     });
+   }
+   return analysis;
+ }
+ async function isUrlAllowed(siteUrl, url, userAgent = "Googlebot") {
+   const robotsUrl = new URL("/robots.txt", siteUrl).href;
+   try {
+     const response = await fetch(robotsUrl);
+     if (!response.ok) return true;
+     const content = await response.text();
+     const robots = robotsParser(robotsUrl, content);
+     return robots.isAllowed(url, userAgent) ?? true;
+   } catch {
+     return true;
+   }
+ }
+ async function analyzeSitemap(sitemapUrl) {
+   const analysis = {
+     url: sitemapUrl,
+     exists: false,
+     type: "unknown",
+     urls: [],
+     childSitemaps: [],
+     issues: []
+   };
+   try {
+     const response = await fetch(sitemapUrl, {
+       headers: {
+         Accept: "application/xml, text/xml, */*"
+       }
+     });
+     if (!response.ok) {
+       analysis.issues.push({
+         id: `sitemap-not-found-${hash2(sitemapUrl)}`,
+         url: sitemapUrl,
+         category: "technical",
+         severity: "error",
+         title: "Sitemap not accessible",
+         description: `Sitemap returned HTTP ${response.status}.`,
+         recommendation: "Ensure the sitemap URL is correct and accessible.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
+         metadata: { statusCode: response.status }
+       });
+       return analysis;
+     }
+     analysis.exists = true;
+     const content = await response.text();
+     const contentType = response.headers.get("content-type") || "";
+     if (!contentType.includes("xml") && !content.trim().startsWith("<?xml")) {
+       analysis.issues.push({
+         id: `sitemap-not-xml-${hash2(sitemapUrl)}`,
+         url: sitemapUrl,
+         category: "technical",
+         severity: "warning",
+         title: "Sitemap is not XML",
+         description: "The sitemap does not have an XML content type.",
+         recommendation: "Ensure sitemap is served with Content-Type: application/xml.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
+         metadata: { contentType }
+       });
+     }
+     const $ = load(content, { xmlMode: true });
+     const sitemapIndex = $("sitemapindex");
+     if (sitemapIndex.length > 0) {
+       analysis.type = "sitemap-index";
+       $("sitemap").each((_, el) => {
+         const loc = $("loc", el).text().trim();
+         if (loc) {
+           analysis.childSitemaps.push(loc);
+         }
+       });
+       consola.debug(`Sitemap index contains ${analysis.childSitemaps.length} sitemaps`);
+     } else {
+       analysis.type = "sitemap";
+       $("url").each((_, el) => {
+         const loc = $("loc", el).text().trim();
+         if (loc) {
+           analysis.urls.push(loc);
+         }
+       });
+       const lastmod = $("url lastmod").first().text().trim();
+       if (lastmod) {
+         analysis.lastmod = lastmod;
+       }
+       consola.debug(`Sitemap contains ${analysis.urls.length} URLs`);
+     }
+     if (analysis.type === "sitemap" && analysis.urls.length === 0) {
+       analysis.issues.push({
+         id: `sitemap-empty-${hash2(sitemapUrl)}`,
+         url: sitemapUrl,
+         category: "technical",
+         severity: "warning",
+         title: "Sitemap is empty",
+         description: "The sitemap contains no URLs.",
+         recommendation: "Add URLs to your sitemap or remove it if not needed.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+       });
+     }
+     if (analysis.urls.length > 5e4) {
+       analysis.issues.push({
+         id: `sitemap-too-large-${hash2(sitemapUrl)}`,
+         url: sitemapUrl,
+         category: "technical",
+         severity: "error",
+         title: "Sitemap exceeds URL limit",
+         description: `Sitemap contains ${analysis.urls.length} URLs. Maximum is 50,000.`,
+         recommendation: "Split the sitemap into multiple files using a sitemap index.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
+         metadata: { urlCount: analysis.urls.length }
+       });
+     }
+     const sizeInMB = new Blob([content]).size / (1024 * 1024);
+     if (sizeInMB > 50) {
+       analysis.issues.push({
+         id: `sitemap-too-large-size-${hash2(sitemapUrl)}`,
+         url: sitemapUrl,
+         category: "technical",
+         severity: "error",
+         title: "Sitemap exceeds size limit",
+         description: `Sitemap is ${sizeInMB.toFixed(2)}MB. Maximum is 50MB.`,
+         recommendation: "Split the sitemap or compress it.",
+         detectedAt: (/* @__PURE__ */ new Date()).toISOString(),
+         metadata: { sizeMB: sizeInMB }
+       });
+     }
+   } catch (error) {
+     consola.error("Failed to analyze sitemap:", error);
+     analysis.issues.push({
+       id: `sitemap-error-${hash2(sitemapUrl)}`,
+       url: sitemapUrl,
+       category: "technical",
+       severity: "error",
+       title: "Failed to parse sitemap",
+       description: `Error: ${error instanceof Error ? error.message : "Unknown error"}`,
+       recommendation: "Check sitemap validity using Google Search Console.",
+       detectedAt: (/* @__PURE__ */ new Date()).toISOString()
+     });
+   }
+   return analysis;
+ }
+ async function analyzeAllSitemaps(sitemapUrl, maxDepth = 3) {
+   const results = [];
+   const visited = /* @__PURE__ */ new Set();
+   async function analyze(url, depth) {
+     if (depth > maxDepth || visited.has(url)) return;
+     visited.add(url);
+     const analysis = await analyzeSitemap(url);
+     results.push(analysis);
+     for (const childUrl of analysis.childSitemaps) {
+       await analyze(childUrl, depth + 1);
+     }
+   }
+   await analyze(sitemapUrl, 0);
+   return results;
+ }
+ function hash2(str) {
+   let hash3 = 0;
+   for (let i = 0; i < str.length; i++) {
+     const char = str.charCodeAt(i);
+     hash3 = (hash3 << 5) - hash3 + char;
+     hash3 = hash3 & hash3;
+   }
+   return Math.abs(hash3).toString(36);
+ }
+
+ export { SiteCrawler, analyzeAllSitemaps, analyzeCrawlResults, analyzeRobotsTxt, analyzeSitemap, isUrlAllowed };
+ //# sourceMappingURL=index.mjs.map
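
Note: the robots.txt and sitemap helpers compose naturally: sitemaps discovered in robots.txt can be fed straight into the recursive validator. A sketch under the same subpath-export assumption as above, again with a placeholder URL:

import { analyzeRobotsTxt, analyzeAllSitemaps } from '@djangocfg/seo/crawler';

// Fetches https://example.com/robots.txt and extracts its Sitemap directives.
const robots = await analyzeRobotsTxt('https://example.com');

// analyzeAllSitemaps follows sitemap-index children recursively
// (maxDepth defaults to 3) and returns one analysis per sitemap file.
for (const sitemapUrl of robots.sitemaps) {
  for (const sitemap of await analyzeAllSitemaps(sitemapUrl)) {
    console.log(`${sitemap.url}: ${sitemap.urls.length} URLs, ${sitemap.issues.length} issues`);
  }
}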