@houtini/seo-crawler-mcp 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/.github/workflows/ci.yml +59 -0
  2. package/LICENSE +190 -0
  3. package/NOTICE +8 -0
  4. package/README.md +694 -0
  5. package/build/analyzers/QueryLoader.d.ts +30 -0
  6. package/build/analyzers/QueryLoader.d.ts.map +1 -0
  7. package/build/analyzers/QueryLoader.js +126 -0
  8. package/build/analyzers/QueryLoader.js.map +1 -0
  9. package/build/cli.d.ts +3 -0
  10. package/build/cli.d.ts.map +1 -0
  11. package/build/cli.js +190 -0
  12. package/build/cli.js.map +1 -0
  13. package/build/core/ContentExtractor.d.ts +30 -0
  14. package/build/core/ContentExtractor.d.ts.map +1 -0
  15. package/build/core/ContentExtractor.js +362 -0
  16. package/build/core/ContentExtractor.js.map +1 -0
  17. package/build/core/CrawlDatabase.d.ts +25 -0
  18. package/build/core/CrawlDatabase.d.ts.map +1 -0
  19. package/build/core/CrawlDatabase.js +603 -0
  20. package/build/core/CrawlDatabase.js.map +1 -0
  21. package/build/core/CrawlOrchestrator.d.ts +27 -0
  22. package/build/core/CrawlOrchestrator.d.ts.map +1 -0
  23. package/build/core/CrawlOrchestrator.js +279 -0
  24. package/build/core/CrawlOrchestrator.js.map +1 -0
  25. package/build/core/CrawlStorage.d.ts +33 -0
  26. package/build/core/CrawlStorage.d.ts.map +1 -0
  27. package/build/core/CrawlStorage.js +94 -0
  28. package/build/core/CrawlStorage.js.map +1 -0
  29. package/build/core/LinkExtractor.d.ts +14 -0
  30. package/build/core/LinkExtractor.d.ts.map +1 -0
  31. package/build/core/LinkExtractor.js +91 -0
  32. package/build/core/LinkExtractor.js.map +1 -0
  33. package/build/core/UrlManager.d.ts +21 -0
  34. package/build/core/UrlManager.d.ts.map +1 -0
  35. package/build/core/UrlManager.js +87 -0
  36. package/build/core/UrlManager.js.map +1 -0
  37. package/build/formatters/structured-report-format.d.ts +48 -0
  38. package/build/formatters/structured-report-format.d.ts.map +1 -0
  39. package/build/formatters/structured-report-format.js +145 -0
  40. package/build/formatters/structured-report-format.js.map +1 -0
  41. package/build/index.d.ts +3 -0
  42. package/build/index.d.ts.map +1 -0
  43. package/build/index.js +214 -0
  44. package/build/index.js.map +1 -0
  45. package/build/schema/index.d.ts +627 -0
  46. package/build/schema/index.d.ts.map +1 -0
  47. package/build/schema/index.js +159 -0
  48. package/build/schema/index.js.map +1 -0
  49. package/build/tools/analyze-seo.d.ts +44 -0
  50. package/build/tools/analyze-seo.d.ts.map +1 -0
  51. package/build/tools/analyze-seo.js +110 -0
  52. package/build/tools/analyze-seo.js.map +1 -0
  53. package/build/tools/list-queries.d.ts +28 -0
  54. package/build/tools/list-queries.d.ts.map +1 -0
  55. package/build/tools/list-queries.js +30 -0
  56. package/build/tools/list-queries.js.map +1 -0
  57. package/build/tools/query-seo-data.d.ts +15 -0
  58. package/build/tools/query-seo-data.d.ts.map +1 -0
  59. package/build/tools/query-seo-data.js +43 -0
  60. package/build/tools/query-seo-data.js.map +1 -0
  61. package/build/tools/run-seo-audit.d.ts +3 -0
  62. package/build/tools/run-seo-audit.d.ts.map +1 -0
  63. package/build/tools/run-seo-audit.js +54 -0
  64. package/build/tools/run-seo-audit.js.map +1 -0
  65. package/build/types/index.d.ts +158 -0
  66. package/build/types/index.d.ts.map +1 -0
  67. package/build/types/index.js +2 -0
  68. package/build/types/index.js.map +1 -0
  69. package/build/utils/debug.d.ts +2 -0
  70. package/build/utils/debug.d.ts.map +1 -0
  71. package/build/utils/debug.js +7 -0
  72. package/build/utils/debug.js.map +1 -0
  73. package/package.json +49 -0
  74. package/server.json +31 -0
  75. package/src/analyzers/QueryLoader.ts +175 -0
  76. package/src/analyzers/queries/README.md +228 -0
  77. package/src/analyzers/queries/content/duplicate-h1.sql +18 -0
  78. package/src/analyzers/queries/content/duplicate-meta-descriptions.sql +18 -0
  79. package/src/analyzers/queries/content/duplicate-titles.sql +19 -0
  80. package/src/analyzers/queries/content/missing-h1.sql +18 -0
  81. package/src/analyzers/queries/content/missing-meta-descriptions.sql +19 -0
  82. package/src/analyzers/queries/content/multiple-h1.sql +17 -0
  83. package/src/analyzers/queries/content/thin-content.sql +18 -0
  84. package/src/analyzers/queries/critical/404-errors.sql +14 -0
  85. package/src/analyzers/queries/critical/broken-internal-links.sql +20 -0
  86. package/src/analyzers/queries/critical/missing-titles.sql +17 -0
  87. package/src/analyzers/queries/critical/server-errors.sql +15 -0
  88. package/src/analyzers/queries/opportunities/high-external-links.sql +18 -0
  89. package/src/analyzers/queries/opportunities/meta-description-length.sql +27 -0
  90. package/src/analyzers/queries/opportunities/missing-images.sql +18 -0
  91. package/src/analyzers/queries/opportunities/no-outbound-links.sql +18 -0
  92. package/src/analyzers/queries/opportunities/title-equals-h1.sql +21 -0
  93. package/src/analyzers/queries/opportunities/title-length.sql +27 -0
  94. package/src/analyzers/queries/security/missing-csp.sql +16 -0
  95. package/src/analyzers/queries/security/missing-hsts.sql +17 -0
  96. package/src/analyzers/queries/security/missing-referrer-policy.sql +16 -0
  97. package/src/analyzers/queries/security/missing-x-frame-options.sql +16 -0
  98. package/src/analyzers/queries/security/protocol-relative-links.sql +16 -0
  99. package/src/analyzers/queries/security/unsafe-external-links.sql +17 -0
  100. package/src/analyzers/queries/technical/canonical-issues.sql +20 -0
  101. package/src/analyzers/queries/technical/heading-hierarchy-issues.sql +19 -0
  102. package/src/analyzers/queries/technical/non-https.sql +16 -0
  103. package/src/analyzers/queries/technical/orphan-pages.sql +21 -0
  104. package/src/analyzers/queries/technical/redirects.sql +15 -0
  105. package/src/cli.ts +224 -0
  106. package/src/core/ContentExtractor.ts +480 -0
  107. package/src/core/CrawlDatabase.ts +736 -0
  108. package/src/core/CrawlOrchestrator.ts +346 -0
  109. package/src/core/CrawlStorage.ts +148 -0
  110. package/src/core/LinkExtractor.ts +123 -0
  111. package/src/core/UrlManager.ts +114 -0
  112. package/src/formatters/structured-report-format.ts +254 -0
  113. package/src/index.ts +259 -0
  114. package/src/schema/index.ts +176 -0
  115. package/src/tools/analyze-seo.ts +184 -0
  116. package/src/tools/list-queries.ts +70 -0
  117. package/src/tools/query-seo-data.ts +77 -0
  118. package/src/tools/run-seo-audit.ts +83 -0
  119. package/src/types/index.ts +179 -0
  120. package/src/utils/debug.ts +12 -0
  121. package/tsconfig.json +26 -0
@@ -0,0 +1,114 @@
1
+ /**
2
+ * UrlManager - URL normalization, deduplication, and tracking
3
+ *
4
+ * Responsibilities:
5
+ * - Normalize URLs (no fragments, no trailing slashes); "www." is ignored only when checking whether a URL is internal
6
+ * - Track discovered vs visited URLs
7
+ * - Determine if URL is internal
8
+ * - Track source pages ("linked from" feature)
9
+ * - Track depth of each URL
10
+ */
11
+
12
/**
 * Tracks the URLs seen during a crawl.
 *
 * Every public method that accepts a URL normalizes it with `normalizeUrl`
 * first, so callers may pass raw links. The manager records:
 * - the depth at which each URL was first discovered,
 * - which URLs have been visited,
 * - which pages linked to each URL ("linked from").
 *
 * Internal/external classification ignores a leading "www." on both the base
 * domain and the candidate URL; normalization itself keeps the host as given.
 */
export class UrlManager {
  private readonly baseDomain: string;
  private readonly discovered = new Map<string, number>();
  private readonly visited = new Set<string>();
  private readonly sourcePagesMap = new Map<string, Set<string>>();

  /**
   * @param baseDomain Domain or full URL of the site being crawled
   *   (for example "example.com" or "https://www.example.com/start").
   */
  constructor(baseDomain: string) {
    this.baseDomain = this.normalizeDomain(baseDomain);
  }

  /**
   * Produce the canonical form used as the key for all internal bookkeeping.
   *
   * The fragment is dropped, the query string is kept verbatim, and one
   * trailing slash is removed from the path. The port is preserved so that
   * hosts such as "localhost:3000" stay reachable.
   *
   * @param url Absolute URL to normalize.
   * @returns The normalized URL, or the input unchanged when it cannot be parsed.
   */
  normalizeUrl(url: string): string {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      return url;
    }

    let path = parsed.pathname;
    const hasQuery = parsed.search !== '';

    // Strip the slash from the path itself, never from the query string.
    // A bare "/" is kept when a query follows, and when removing it would
    // leave nothing but the scheme (empty host).
    const isBareRoot = path === '/';
    const mayStripRoot = !hasQuery && parsed.host !== '';
    if (path.endsWith('/') && (!isBareRoot || mayStripRoot)) {
      path = path.slice(0, -1);
    }

    // `host` (unlike `hostname`) includes a non-default port.
    return `${parsed.protocol}//${parsed.host}${path}${parsed.search}`;
  }

  /**
   * Reduce a domain or URL to its bare hostname without a leading "www.".
   * Falls back to stripping "www." from the raw input when parsing fails.
   */
  private normalizeDomain(domain: string): string {
    const stripWww = (host: string): string => host.replace(/^www\./, '');

    try {
      // Only an explicit http(s) scheme counts; "httpbin.org" is a bare domain.
      const hasScheme = /^https?:\/\//i.test(domain);
      const withScheme = hasScheme ? domain : `https://${domain}`;
      return stripWww(new URL(withScheme).hostname);
    } catch {
      return stripWww(domain);
    }
  }

  /**
   * @returns true when the URL's hostname (ignoring "www.") equals the base
   *   domain; false for other hosts and for unparseable input.
   */
  isInternal(url: string): boolean {
    try {
      const urlDomain = new URL(url).hostname.replace(/^www\./, '');
      return urlDomain === this.baseDomain;
    } catch {
      return false;
    }
  }

  /**
   * Record a URL as discovered.
   *
   * The depth of the first discovery wins; later calls never overwrite it.
   * When `sourceUrl` is given it is added to the URL's "linked from" set.
   */
  addDiscovered(url: string, depth: number, sourceUrl?: string): void {
    const normalized = this.normalizeUrl(url);

    if (!this.discovered.has(normalized)) {
      this.discovered.set(normalized, depth);
    }

    if (sourceUrl) {
      let sources = this.sourcePagesMap.get(normalized);
      if (sources === undefined) {
        sources = new Set<string>();
        this.sourcePagesMap.set(normalized, sources);
      }
      sources.add(this.normalizeUrl(sourceUrl));
    }
  }

  /** Mark a URL as visited. */
  markVisited(url: string): void {
    this.visited.add(this.normalizeUrl(url));
  }

  /** @returns true when the URL has been marked visited. */
  isVisited(url: string): boolean {
    return this.visited.has(this.normalizeUrl(url));
  }

  /** @returns true when the URL has been recorded via `addDiscovered`. */
  isDiscovered(url: string): boolean {
    return this.discovered.has(this.normalizeUrl(url));
  }

  /** @returns The normalized URLs of pages recorded as linking to `url` (empty when none). */
  getSourcePages(url: string): string[] {
    const sources = this.sourcePagesMap.get(this.normalizeUrl(url));
    return sources ? Array.from(sources) : [];
  }

  /** @returns The depth recorded for `url`, or 0 when the URL was never discovered. */
  getDepth(url: string): number {
    return this.discovered.get(this.normalizeUrl(url)) ?? 0;
  }

  /** @returns Number of distinct discovered URLs. */
  getTotalDiscovered(): number {
    return this.discovered.size;
  }

  /** @returns Number of distinct visited URLs. */
  getTotalVisited(): number {
    return this.visited.size;
  }

  /** @returns The largest recorded depth, or 0 when nothing has been discovered. */
  getMaxDepth(): number {
    // Plain loop: spreading a very large map into Math.max can exceed the
    // engine's argument limit.
    let deepest = 0;
    for (const depth of this.discovered.values()) {
      if (depth > deepest) {
        deepest = depth;
      }
    }
    return deepest;
  }

  /** @returns Discovered (normalized) URLs that have not been marked visited, in discovery order. */
  getUnvisitedUrls(): string[] {
    return Array.from(this.discovered.keys()).filter(url => !this.visited.has(url));
  }
}
@@ -0,0 +1,254 @@
1
+ /**
2
+ * Structured SEO Report Formatter
3
+ *
4
+ * Transforms raw SEO analysis data into structured, actionable reports
5
+ * with clear issue categorization and remediation guidance.
6
+ *
7
+ * Output Format:
8
+ * - Issue Name | Issue Type | Priority | URLs | % of Total | Description | How To Fix
9
+ */
10
+
11
/** One example of a URL affected by an issue, with optional extra context. */
interface IssueExample {
  url: string;
  detail?: string;
}

type IssueType = 'Issue' | 'Warning' | 'Opportunity';
type IssuePriority = 'Critical' | 'High' | 'Medium' | 'Low';

/** Raw result of one SEO analysis query, as consumed by the formatter. */
interface SEOIssue {
  query: string;
  category: string;
  priority: string;
  description: string;
  impact: string;
  fix: string;
  affectedCount: number;
  examples: IssueExample[];
}

/** One row of the structured report. */
interface StructuredIssue {
  issueName: string;
  issueType: IssueType;
  issuePriority: IssuePriority;
  urls: number;
  percentOfTotal: number;
  description: string;
  howToFix: string;
  examples: IssueExample[];
}

/** Complete structured report: summary counts plus the prioritized issue list. */
interface StructuredReport {
  overview: {
    totalPages: number;
    totalIssues: number;
    criticalIssues: number;
    highPriorityIssues: number;
    mediumPriorityIssues: number;
    lowPriorityIssues: number;
  };
  issues: StructuredIssue[];
  executionTime: number;
}

/**
 * Converts raw SEO query results into a structured report and renders that
 * report as plain text.
 */
export class StructuredReportFormatter {
  /** Sort rank per priority; lower values are listed first. */
  private static readonly PRIORITY_RANK: Record<IssuePriority, number> = {
    Critical: 0,
    High: 1,
    Medium: 2,
    Low: 3
  };

  /** Display names keyed by query name. A Map avoids hits on inherited object keys. */
  private static readonly ISSUE_NAMES: ReadonlyMap<string, string> = new Map([
    ['missing-titles', 'Page Titles: Missing'],
    ['broken-internal-links', 'Links: Broken Internal Links'],
    ['server-errors', 'Response Codes: Internal Server Error (5xx)'],
    ['404-errors', 'Response Codes: Internal Client Error (4xx)'],
    ['duplicate-titles', 'Page Titles: Duplicate'],
    ['duplicate-meta-descriptions', 'Meta Description: Duplicate'],
    ['missing-meta-descriptions', 'Meta Description: Missing'],
    ['thin-content', 'Content: Low Content Pages'],
    ['missing-h1', 'H1: Missing'],
    ['multiple-h1', 'H1: Multiple'],
    ['duplicate-h1', 'H1: Duplicate'],
    ['redirects', 'Response Codes: Internal Redirection (3xx)'],
    ['orphan-pages', 'Links: Orphan Pages'],
    ['canonical-issues', 'Canonical: Issues'],
    ['non-https', 'Security: Non-HTTPS URLs'],
    ['heading-hierarchy-issues', 'Heading Hierarchy: Issues'],
    ['missing-hsts', 'Security: Missing HSTS Header'],
    ['missing-csp', 'Security: Missing Content-Security-Policy Header'],
    ['missing-x-frame-options', 'Security: Missing X-Frame-Options Header'],
    ['missing-referrer-policy', 'Security: Missing Secure Referrer-Policy Header'],
    ['unsafe-external-links', 'Security: Unsafe Cross-Origin Links'],
    ['protocol-relative-links', 'Security: Protocol-Relative Resource Links'],
    ['title-length', 'Page Titles: Length Issues'],
    ['meta-description-length', 'Meta Description: Length Issues'],
    ['title-equals-h1', 'Page Titles: Same as H1'],
    ['no-outbound-links', 'Links: Pages With No Outbound Links'],
    ['high-external-links', 'Links: Pages With High External Outlinks'],
    ['missing-images', 'Images: Missing On Content Pages']
  ]);

  private static readonly HEAVY_RULE = '═══════════════════════════════════════════════════════';
  private static readonly LIGHT_RULE = '───────────────────────────────────────────────────────';

  /**
   * Convert internal SEO issues to the structured report format.
   *
   * @param issues Raw query results.
   * @param totalPages Number of crawled pages, used for the percentage column.
   * @param executionTime Analysis duration, copied into the report unchanged.
   * @returns Report whose issues are ordered Critical > High > Medium > Low,
   *   keeping input order within the same priority.
   */
  static formatReport(
    issues: SEOIssue[],
    totalPages: number,
    executionTime: number
  ): StructuredReport {
    const rank = StructuredReportFormatter.PRIORITY_RANK;

    const formattedIssues = issues
      .map(issue => StructuredReportFormatter.formatIssue(issue, totalPages))
      .sort((a, b) => rank[a.issuePriority] - rank[b.issuePriority]);

    const countOf = (priority: IssuePriority): number =>
      formattedIssues.filter(issue => issue.issuePriority === priority).length;

    return {
      overview: {
        totalPages,
        totalIssues: formattedIssues.length,
        criticalIssues: countOf('Critical'),
        highPriorityIssues: countOf('High'),
        mediumPriorityIssues: countOf('Medium'),
        lowPriorityIssues: countOf('Low')
      },
      issues: formattedIssues,
      executionTime
    };
  }

  /**
   * Format an individual issue as a structured report row.
   */
  private static formatIssue(
    issue: SEOIssue,
    totalPages: number
  ): StructuredIssue {
    // An empty crawl would otherwise produce NaN/Infinity, which serializes to null.
    const percentOfTotal = totalPages > 0
      ? parseFloat(((issue.affectedCount / totalPages) * 100).toFixed(2))
      : 0;

    return {
      issueName: StructuredReportFormatter.formatIssueName(issue),
      issueType: StructuredReportFormatter.getIssueType(issue.category, issue.priority),
      issuePriority: StructuredReportFormatter.normalizePriority(issue.priority),
      urls: issue.affectedCount,
      percentOfTotal,
      description: issue.description,
      howToFix: issue.fix,
      examples: issue.examples
    };
  }

  /**
   * Convert a query name to a user-friendly issue name; unknown queries keep
   * their raw name.
   */
  private static formatIssueName(issue: SEOIssue): string {
    return StructuredReportFormatter.ISSUE_NAMES.get(issue.query) || issue.query;
  }

  /**
   * Determine the issue type from category and priority.
   *
   * Critical/High priority (any letter case) is an Issue; otherwise the
   * "opportunities" category is an Opportunity; everything else is a Warning.
   */
  private static getIssueType(
    category: string,
    priority: string
  ): IssueType {
    // Classify on the normalized value so this agrees with issuePriority.
    const normalized = StructuredReportFormatter.normalizePriority(priority);

    if (normalized === 'Critical' || normalized === 'High') {
      return 'Issue';
    }

    return category === 'opportunities' ? 'Opportunity' : 'Warning';
  }

  /**
   * Normalize a priority string to one of the four title-case levels.
   *
   * Matching ignores letter case and surrounding whitespace. Unrecognized
   * values fall back to 'Low' so every issue has a valid sort rank and still
   * appears in the text summary.
   */
  private static normalizePriority(priority: string): IssuePriority {
    switch (priority.trim().toLowerCase()) {
      case 'critical':
        return 'Critical';
      case 'high':
        return 'High';
      case 'medium':
        return 'Medium';
      default:
        return 'Low';
    }
  }

  /**
   * Generate a plain text summary for terminal output.
   *
   * Critical and High issues are listed with full detail; Medium and Low
   * issues get one line each. Sections with no issues are omitted.
   */
  static generateTextSummary(report: StructuredReport): string {
    const { overview } = report;
    const heavy = StructuredReportFormatter.HEAVY_RULE;

    const lines: string[] = [
      heavy,
      ' SEO AUDIT SUMMARY',
      heavy,
      '',
      `Total Pages Crawled: ${overview.totalPages.toLocaleString()}`,
      `Total Issues Found: ${overview.totalIssues}`,
      '',
      'Issues by Priority:',
      ` Critical: ${overview.criticalIssues}`,
      ` High: ${overview.highPriorityIssues}`,
      ` Medium: ${overview.mediumPriorityIssues}`,
      ` Low: ${overview.lowPriorityIssues}`,
      '',
      `Analysis Time: ${report.executionTime}ms`,
      heavy,
      ''
    ];

    const withPriority = (priority: IssuePriority): StructuredIssue[] =>
      report.issues.filter(issue => issue.issuePriority === priority);

    lines.push(
      ...StructuredReportFormatter.detailedSection('CRITICAL ISSUES (Fix Immediately)', withPriority('Critical')),
      ...StructuredReportFormatter.detailedSection('HIGH PRIORITY ISSUES', withPriority('High')),
      ...StructuredReportFormatter.compactSection('MEDIUM PRIORITY ISSUES', withPriority('Medium')),
      ...StructuredReportFormatter.compactSection('LOW PRIORITY / OPPORTUNITIES', withPriority('Low'))
    );

    return lines.join('\n');
  }

  /** Render a section with a multi-line entry per issue; empty input yields no lines. */
  private static detailedSection(heading: string, issues: StructuredIssue[]): string[] {
    if (issues.length === 0) {
      return [];
    }

    const section = [heading, StructuredReportFormatter.LIGHT_RULE];
    for (const issue of issues) {
      section.push(
        '',
        `${issue.issueName}`,
        ` URLs Affected: ${issue.urls} (${issue.percentOfTotal}%)`,
        ` Type: ${issue.issueType}`,
        ` Description: ${issue.description}`,
        ` How To Fix: ${issue.howToFix}`
      );
    }
    section.push('');
    return section;
  }

  /** Render a section with one line per issue; empty input yields no lines. */
  private static compactSection(heading: string, issues: StructuredIssue[]): string[] {
    if (issues.length === 0) {
      return [];
    }

    return [
      heading,
      StructuredReportFormatter.LIGHT_RULE,
      ...issues.map(issue => `${issue.issueName} - ${issue.urls} URLs (${issue.percentOfTotal}%)`),
      ''
    ];
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,259 @@
1
+ #!/usr/bin/env node
2
+
3
+ // Copyright 2026 Richard Baxter
4
+ // Licensed under the Apache License, Version 2.0
5
+
6
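+ // CRITICAL: Disable ALL Crawlee logging for MCP stdio compatibility (stdout must carry only protocol messages). NOTE(review): if this file is emitted as an ES module, the static imports below are evaluated before this assignment runs - confirm Crawlee reads CRAWLEE_LOG_LEVEL lazily, or move the assignment into a module that is imported first.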
7
+ process.env.CRAWLEE_LOG_LEVEL = 'OFF';
8
+
9
+ /**
10
+ * Crawlee MCP Server v2
11
+ *
12
+ * Professional website crawler and SEO analyzer
13
+ * Built with @modelcontextprotocol/sdk and Crawlee
14
+ *
15
+ * Phase 1: MCP server skeleton with tool registration ✅
16
+ * Phase 2: Full crawling engine implementation ✅
17
+ * Phase 3: SEO analysis layer ✅
18
+ * Phase 4: Fixed RequestQueue persistence bug ✅
19
+ */
20
+
21
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
22
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
23
+ import {
24
+ CallToolRequestSchema,
25
+ ListToolsRequestSchema,
26
+ Tool
27
+ } from '@modelcontextprotocol/sdk/types.js';
28
+
29
+ import { runSeoAudit } from './tools/run-seo-audit.js';
30
+ import { analyzeSeo } from './tools/analyze-seo.js';
31
+ import { querySeoData } from './tools/query-seo-data.js';
32
+ import { listQueries } from './tools/list-queries.js';
33
+
34
const SERVER_NAME = 'seo-crawler-mcp';
const SERVER_VERSION = '2.0.1'; // Version bump for storage cleanup fix

/** Analysis categories accepted wherever a tool filters by category. */
const SEO_CATEGORIES = ['critical', 'content', 'technical', 'security', 'opportunities'];

/**
 * Tool definitions advertised in response to a ListTools request.
 *
 * Each `inputSchema` is a JSON Schema object describing the tool's arguments.
 * Count-like arguments (page limits, depth, result limits) are declared as
 * `integer` because fractional values have no meaning for them.
 */
const tools: Tool[] = [
  {
    name: 'run_seo_audit',
    description: 'Crawl a website and extract comprehensive SEO data using Crawlee HttpCrawler. Returns crawl ID and output path.',
    inputSchema: {
      type: 'object',
      properties: {
        url: {
          type: 'string',
          description: 'Starting URL to crawl (must include http:// or https://)'
        },
        maxPages: {
          type: 'integer',
          description: 'Maximum number of pages to crawl (1-10000). Default: 1000',
          minimum: 1,
          maximum: 10000
        },
        depth: {
          type: 'integer',
          description: 'Maximum crawl depth (1-10). Default: 3',
          minimum: 1,
          maximum: 10
        },
        userAgent: {
          type: 'string',
          enum: ['chrome', 'googlebot'],
          description: 'User agent to identify as: "chrome" (default, Chrome browser) or "googlebot" (Googlebot crawler). Default: chrome'
        }
      },
      required: ['url']
    }
  },
  {
    name: 'analyze_seo',
    description: 'Analyze SEO data from a completed crawl. Runs 25+ SQL queries to detect critical issues, content problems, technical SEO issues, security vulnerabilities, and optimization opportunities. Returns structured report with affected URLs and fix recommendations.',
    inputSchema: {
      type: 'object',
      properties: {
        crawlPath: {
          type: 'string',
          description: 'Path to crawl output directory (e.g., C:/seo-audits/example.com_2026-02-01_abc123)'
        },
        includeCategories: {
          type: 'array',
          items: {
            type: 'string',
            enum: [...SEO_CATEGORIES]
          },
          description: 'Optional: Filter analysis by categories. Default: all categories'
        },
        maxExamplesPerIssue: {
          type: 'integer',
          description: 'Maximum example URLs to return per issue. Default: 10',
          minimum: 1,
          maximum: 100
        },
        format: {
          type: 'string',
          enum: ['detailed', 'summary', 'structured'],
          description: 'Output format: "structured" (organized format, default), "summary" (text overview), "detailed" (full JSON). Default: structured'
        }
      },
      required: ['crawlPath']
    }
  },
  {
    name: 'query_seo_data',
    description: 'Execute a specific SEO analysis query by name. Use list_seo_queries to see available queries. Returns detailed results with affected URLs and context.',
    inputSchema: {
      type: 'object',
      properties: {
        crawlPath: {
          type: 'string',
          description: 'Path to crawl output directory'
        },
        query: {
          type: 'string',
          description: 'Query name (e.g., "missing-titles", "duplicate-h1", "orphan-pages"). Use list_seo_queries to see all available queries.'
        },
        limit: {
          type: 'integer',
          description: 'Optional: Maximum number of results to return. Default: 100',
          minimum: 1,
          maximum: 1000
        }
      },
      required: ['crawlPath', 'query']
    }
  },
  {
    name: 'list_seo_queries',
    description: 'List all available SEO analysis queries with descriptions, priorities, and fix recommendations. Optionally filter by category or priority level.',
    inputSchema: {
      type: 'object',
      properties: {
        category: {
          type: 'string',
          enum: [...SEO_CATEGORIES],
          description: 'Optional: Filter by category'
        },
        priority: {
          type: 'string',
          enum: ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW'],
          description: 'Optional: Filter by priority level'
        }
      }
    }
  }
];
146
+
147
/**
 * MCP server that exposes the SEO crawler tools over a stdio transport.
 *
 * Construction registers the request handlers and error handling; `run()`
 * connects the transport. Diagnostics go to stderr via `console.error`.
 */
class CrawleeMcpServer {
  private readonly server: Server;

  constructor() {
    this.server = new Server(
      {
        name: SERVER_NAME,
        version: SERVER_VERSION
      },
      {
        capabilities: {
          tools: {}
        }
      }
    );

    this.setupHandlers();
    this.setupErrorHandling();
  }

  /**
   * Register the ListTools and CallTool handlers.
   *
   * CallTool dispatches on the tool name, returns the tool result as
   * pretty-printed JSON text, and reports failures (including unknown tool
   * names) as results flagged with `isError` instead of throwing.
   */
  private setupHandlers(): void {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;

      try {
        let result: unknown;

        switch (name) {
          case 'run_seo_audit':
            result = await runSeoAudit(args as any);
            break;

          case 'analyze_seo':
            result = await analyzeSeo(args as any);
            break;

          case 'query_seo_data':
            result = await querySeoData(args as any);
            break;

          case 'list_seo_queries':
            result = await listQueries(args as any);
            break;

          default:
            return {
              content: [
                {
                  type: 'text',
                  text: `Unknown tool: ${name}`
                }
              ],
              isError: true
            };
        }

        // JSON.stringify yields undefined for an undefined result; always send a string.
        const text = JSON.stringify(result, null, 2) ?? 'null';

        return {
          content: [
            {
              type: 'text',
              text
            }
          ]
        };

      } catch (error) {
        return {
          content: [
            {
              type: 'text',
              text: JSON.stringify({
                error: CrawleeMcpServer.describeError(error),
                tool: name
              }, null, 2)
            }
          ],
          isError: true
        };
      }
    });
  }

  /**
   * Extract a human-readable message from a thrown value.
   * Error instances and non-empty thrown strings keep their message;
   * anything else is reported as 'Unknown error'.
   */
  private static describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === 'string' && error.length > 0) {
      return error;
    }
    return 'Unknown error';
  }

  /**
   * Log server-level errors to stderr and close the server on SIGINT.
   */
  private setupErrorHandling(): void {
    this.server.onerror = (error) => {
      console.error('[MCP Error]', error);
    };

    process.on('SIGINT', async () => {
      let exitCode = 0;
      try {
        await this.server.close();
      } catch (error) {
        // A failed close must not leave the process hanging on an unhandled rejection.
        console.error('[MCP Error]', error);
        exitCode = 1;
      } finally {
        process.exit(exitCode);
      }
    });
  }

  /**
   * Connect the server to a stdio transport and log startup status to stderr.
   */
  async run(): Promise<void> {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);

    console.error(`${SERVER_NAME} v${SERVER_VERSION} running on stdio`);
    console.error('✅ Phase 1: MCP server active');
    console.error('✅ Phase 2: Crawling engine ready');
    console.error('✅ Phase 3: SEO analysis layer active');
    console.error('✅ Phase 4: RequestQueue bug fixed');
  }
}
257
+
258
// Entry point: build the server and start serving on stdio.
const server = new CrawleeMcpServer();
server.run().catch((error: unknown) => {
  // Report the startup failure on stderr and make the process exit non-zero
  // so the launching client can tell the server did not start.
  console.error(error);
  process.exitCode = 1;
});