@houtini/seo-crawler-mcp 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +59 -0
- package/LICENSE +190 -0
- package/NOTICE +8 -0
- package/README.md +694 -0
- package/build/analyzers/QueryLoader.d.ts +30 -0
- package/build/analyzers/QueryLoader.d.ts.map +1 -0
- package/build/analyzers/QueryLoader.js +126 -0
- package/build/analyzers/QueryLoader.js.map +1 -0
- package/build/cli.d.ts +3 -0
- package/build/cli.d.ts.map +1 -0
- package/build/cli.js +190 -0
- package/build/cli.js.map +1 -0
- package/build/core/ContentExtractor.d.ts +30 -0
- package/build/core/ContentExtractor.d.ts.map +1 -0
- package/build/core/ContentExtractor.js +362 -0
- package/build/core/ContentExtractor.js.map +1 -0
- package/build/core/CrawlDatabase.d.ts +25 -0
- package/build/core/CrawlDatabase.d.ts.map +1 -0
- package/build/core/CrawlDatabase.js +603 -0
- package/build/core/CrawlDatabase.js.map +1 -0
- package/build/core/CrawlOrchestrator.d.ts +27 -0
- package/build/core/CrawlOrchestrator.d.ts.map +1 -0
- package/build/core/CrawlOrchestrator.js +279 -0
- package/build/core/CrawlOrchestrator.js.map +1 -0
- package/build/core/CrawlStorage.d.ts +33 -0
- package/build/core/CrawlStorage.d.ts.map +1 -0
- package/build/core/CrawlStorage.js +94 -0
- package/build/core/CrawlStorage.js.map +1 -0
- package/build/core/LinkExtractor.d.ts +14 -0
- package/build/core/LinkExtractor.d.ts.map +1 -0
- package/build/core/LinkExtractor.js +91 -0
- package/build/core/LinkExtractor.js.map +1 -0
- package/build/core/UrlManager.d.ts +21 -0
- package/build/core/UrlManager.d.ts.map +1 -0
- package/build/core/UrlManager.js +87 -0
- package/build/core/UrlManager.js.map +1 -0
- package/build/formatters/structured-report-format.d.ts +48 -0
- package/build/formatters/structured-report-format.d.ts.map +1 -0
- package/build/formatters/structured-report-format.js +145 -0
- package/build/formatters/structured-report-format.js.map +1 -0
- package/build/index.d.ts +3 -0
- package/build/index.d.ts.map +1 -0
- package/build/index.js +214 -0
- package/build/index.js.map +1 -0
- package/build/schema/index.d.ts +627 -0
- package/build/schema/index.d.ts.map +1 -0
- package/build/schema/index.js +159 -0
- package/build/schema/index.js.map +1 -0
- package/build/tools/analyze-seo.d.ts +44 -0
- package/build/tools/analyze-seo.d.ts.map +1 -0
- package/build/tools/analyze-seo.js +110 -0
- package/build/tools/analyze-seo.js.map +1 -0
- package/build/tools/list-queries.d.ts +28 -0
- package/build/tools/list-queries.d.ts.map +1 -0
- package/build/tools/list-queries.js +30 -0
- package/build/tools/list-queries.js.map +1 -0
- package/build/tools/query-seo-data.d.ts +15 -0
- package/build/tools/query-seo-data.d.ts.map +1 -0
- package/build/tools/query-seo-data.js +43 -0
- package/build/tools/query-seo-data.js.map +1 -0
- package/build/tools/run-seo-audit.d.ts +3 -0
- package/build/tools/run-seo-audit.d.ts.map +1 -0
- package/build/tools/run-seo-audit.js +54 -0
- package/build/tools/run-seo-audit.js.map +1 -0
- package/build/types/index.d.ts +158 -0
- package/build/types/index.d.ts.map +1 -0
- package/build/types/index.js +2 -0
- package/build/types/index.js.map +1 -0
- package/build/utils/debug.d.ts +2 -0
- package/build/utils/debug.d.ts.map +1 -0
- package/build/utils/debug.js +7 -0
- package/build/utils/debug.js.map +1 -0
- package/package.json +49 -0
- package/server.json +31 -0
- package/src/analyzers/QueryLoader.ts +175 -0
- package/src/analyzers/queries/README.md +228 -0
- package/src/analyzers/queries/content/duplicate-h1.sql +18 -0
- package/src/analyzers/queries/content/duplicate-meta-descriptions.sql +18 -0
- package/src/analyzers/queries/content/duplicate-titles.sql +19 -0
- package/src/analyzers/queries/content/missing-h1.sql +18 -0
- package/src/analyzers/queries/content/missing-meta-descriptions.sql +19 -0
- package/src/analyzers/queries/content/multiple-h1.sql +17 -0
- package/src/analyzers/queries/content/thin-content.sql +18 -0
- package/src/analyzers/queries/critical/404-errors.sql +14 -0
- package/src/analyzers/queries/critical/broken-internal-links.sql +20 -0
- package/src/analyzers/queries/critical/missing-titles.sql +17 -0
- package/src/analyzers/queries/critical/server-errors.sql +15 -0
- package/src/analyzers/queries/opportunities/high-external-links.sql +18 -0
- package/src/analyzers/queries/opportunities/meta-description-length.sql +27 -0
- package/src/analyzers/queries/opportunities/missing-images.sql +18 -0
- package/src/analyzers/queries/opportunities/no-outbound-links.sql +18 -0
- package/src/analyzers/queries/opportunities/title-equals-h1.sql +21 -0
- package/src/analyzers/queries/opportunities/title-length.sql +27 -0
- package/src/analyzers/queries/security/missing-csp.sql +16 -0
- package/src/analyzers/queries/security/missing-hsts.sql +17 -0
- package/src/analyzers/queries/security/missing-referrer-policy.sql +16 -0
- package/src/analyzers/queries/security/missing-x-frame-options.sql +16 -0
- package/src/analyzers/queries/security/protocol-relative-links.sql +16 -0
- package/src/analyzers/queries/security/unsafe-external-links.sql +17 -0
- package/src/analyzers/queries/technical/canonical-issues.sql +20 -0
- package/src/analyzers/queries/technical/heading-hierarchy-issues.sql +19 -0
- package/src/analyzers/queries/technical/non-https.sql +16 -0
- package/src/analyzers/queries/technical/orphan-pages.sql +21 -0
- package/src/analyzers/queries/technical/redirects.sql +15 -0
- package/src/cli.ts +224 -0
- package/src/core/ContentExtractor.ts +480 -0
- package/src/core/CrawlDatabase.ts +736 -0
- package/src/core/CrawlOrchestrator.ts +346 -0
- package/src/core/CrawlStorage.ts +148 -0
- package/src/core/LinkExtractor.ts +123 -0
- package/src/core/UrlManager.ts +114 -0
- package/src/formatters/structured-report-format.ts +254 -0
- package/src/index.ts +259 -0
- package/src/schema/index.ts +176 -0
- package/src/tools/analyze-seo.ts +184 -0
- package/src/tools/list-queries.ts +70 -0
- package/src/tools/query-seo-data.ts +77 -0
- package/src/tools/run-seo-audit.ts +83 -0
- package/src/types/index.ts +179 -0
- package/src/utils/debug.ts +12 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
 * UrlManager - URL normalization, deduplication, and tracking
 *
 * Responsibilities:
 * - Normalize URLs (no fragments, no credentials, no trailing path slash)
 * - Track discovered vs visited URLs
 * - Determine if URL is internal (www-agnostic hostname comparison)
 * - Track source pages ("linked from" feature)
 * - Track depth of each URL
 */
export class UrlManager {
  /** Hostname of the crawl target with any leading "www." removed. */
  private readonly baseDomain: string;
  /** Normalized URL -> depth recorded the first time the URL was seen. */
  private readonly discovered = new Map<string, number>();
  /** Normalized URLs that have been marked as visited. */
  private readonly visited = new Set<string>();
  /** Normalized URL -> normalized URLs of the pages that link to it. */
  private readonly sourcePagesMap = new Map<string, Set<string>>();

  /**
   * @param baseDomain Bare host ("example.com") or full URL of the site
   *   being crawled; used as the reference for {@link isInternal}.
   */
  constructor(baseDomain: string) {
    this.baseDomain = this.normalizeDomain(baseDomain);
  }

  /**
   * Produce the canonical key used for every lookup in this class.
   *
   * Keeps scheme, host (including an explicit port), path and query string;
   * drops the fragment and any credentials. One trailing slash is removed
   * from the path so "/a/" and "/a" share a key.
   *
   * @param url Absolute URL to normalize.
   * @returns The normalized URL, or `url` unchanged when it cannot be parsed.
   */
  normalizeUrl(url: string): string {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      // Unparseable input is used verbatim as its own key.
      return url;
    }

    // Trim the slash on the path itself, before the query is appended, so a
    // query value that ends in "/" is never altered and "/a/?x=1" collapses
    // to "/a?x=1". A lone "/" is only dropped when a host precedes it, so
    // host-less URLs such as "file:///" keep their root.
    let path = parsed.pathname;
    const canTrim = parsed.host.length > 0 || path.length > 1;
    if (path.endsWith('/') && canTrim) {
      path = path.slice(0, -1);
    }

    // `host` (not `hostname`) keeps a non-default port, so
    // "http://localhost:3000/a" is not conflated with "http://localhost/a".
    return `${parsed.protocol}//${parsed.host}${path}${parsed.search}`;
  }

  /**
   * Reduce a domain or URL to its hostname without a leading "www.".
   * Input lacking a "scheme://" prefix is parsed as if it were https.
   */
  private normalizeDomain(domain: string): string {
    try {
      // Test for an actual "scheme://" prefix; a plain startsWith('http')
      // check would misread hosts such as "http-tools.example.com".
      const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(domain);
      const candidate = hasScheme ? domain : `https://${domain}`;
      return new URL(candidate).hostname.replace(/^www\./, '');
    } catch {
      return domain.replace(/^www\./, '');
    }
  }

  /**
   * @returns true when the URL's hostname (ignoring a leading "www.")
   *   equals the base domain; false otherwise, including for input that
   *   cannot be parsed as an absolute URL.
   */
  isInternal(url: string): boolean {
    try {
      const host = new URL(url).hostname.replace(/^www\./, '');
      return host === this.baseDomain;
    } catch {
      return false;
    }
  }

  /**
   * Record a URL as discovered. The depth of the first sighting is kept;
   * later sightings only contribute their source page.
   *
   * @param url URL that was found.
   * @param depth Depth at which it was found.
   * @param sourceUrl Optional page on which the link appeared.
   */
  addDiscovered(url: string, depth: number, sourceUrl?: string): void {
    const key = this.normalizeUrl(url);

    if (!this.discovered.has(key)) {
      this.discovered.set(key, depth);
    }

    if (sourceUrl) {
      let sources = this.sourcePagesMap.get(key);
      if (!sources) {
        sources = new Set<string>();
        this.sourcePagesMap.set(key, sources);
      }
      sources.add(this.normalizeUrl(sourceUrl));
    }
  }

  /** Mark a URL as visited. */
  markVisited(url: string): void {
    this.visited.add(this.normalizeUrl(url));
  }

  /** @returns true when the URL has been marked as visited. */
  isVisited(url: string): boolean {
    return this.visited.has(this.normalizeUrl(url));
  }

  /** @returns true when the URL has been recorded via addDiscovered. */
  isDiscovered(url: string): boolean {
    return this.discovered.has(this.normalizeUrl(url));
  }

  /** @returns Normalized URLs of the pages recorded as linking to `url`. */
  getSourcePages(url: string): string[] {
    const sources = this.sourcePagesMap.get(this.normalizeUrl(url));
    return sources ? Array.from(sources) : [];
  }

  /** @returns Recorded depth of the URL, or 0 when it was never discovered. */
  getDepth(url: string): number {
    return this.discovered.get(this.normalizeUrl(url)) ?? 0;
  }

  /** @returns Number of distinct discovered URLs. */
  getTotalDiscovered(): number {
    return this.discovered.size;
  }

  /** @returns Number of distinct visited URLs. */
  getTotalVisited(): number {
    return this.visited.size;
  }

  /** @returns Largest recorded depth, or 0 when nothing was discovered. */
  getMaxDepth(): number {
    // Plain loop instead of Math.max(...values): spreading a very large map
    // into call arguments can exceed the engine's argument limit.
    let deepest = 0;
    for (const depth of this.discovered.values()) {
      if (depth > deepest) {
        deepest = depth;
      }
    }
    return deepest;
  }

  /** @returns Discovered URLs (normalized) that are not yet visited. */
  getUnvisitedUrls(): string[] {
    return [...this.discovered.keys()].filter((key) => !this.visited.has(key));
  }
}
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured SEO Report Formatter
|
|
3
|
+
*
|
|
4
|
+
* Transforms raw SEO analysis data into structured, actionable reports
|
|
5
|
+
* with clear issue categorization and remediation guidance.
|
|
6
|
+
*
|
|
7
|
+
* Output Format:
|
|
8
|
+
* - Issue Name | Issue Type | Priority | URLs | % of Total | Description | How To Fix
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/** Display classification of a finding. */
type IssueType = 'Issue' | 'Warning' | 'Opportunity';

/** Display priority of a finding, most urgent first. */
type IssuePriority = 'Critical' | 'High' | 'Medium' | 'Low';

/** Raw finding as handed to the formatter. */
interface SEOIssue {
  /** Query identifier, e.g. "missing-titles". */
  query: string;
  /** Query category, e.g. "opportunities". */
  category: string;
  /** Priority label; matched case-insensitively (e.g. "CRITICAL", "high"). */
  priority: string;
  description: string;
  impact: string;
  fix: string;
  /** Number of URLs affected by this finding. */
  affectedCount: number;
  examples: Array<{
    url: string;
    detail?: string;
  }>;
}

/** One row of the structured report. */
interface StructuredIssue {
  issueName: string;
  issueType: 'Issue' | 'Warning' | 'Opportunity';
  issuePriority: 'Critical' | 'High' | 'Medium' | 'Low';
  /** Number of URLs affected. */
  urls: number;
  /** Affected URLs as a percentage of all pages, rounded to 2 decimals. */
  percentOfTotal: number;
  description: string;
  howToFix: string;
  examples: Array<{
    url: string;
    detail?: string;
  }>;
}

/** Complete report: headline counts plus the priority-sorted issue rows. */
interface StructuredReport {
  overview: {
    totalPages: number;
    totalIssues: number;
    criticalIssues: number;
    highPriorityIssues: number;
    mediumPriorityIssues: number;
    lowPriorityIssues: number;
  };
  issues: StructuredIssue[];
  executionTime: number;
}

export class StructuredReportFormatter {
  /** Sort rank per priority (Critical > High > Medium > Low). */
  private static readonly PRIORITY_RANK: Readonly<Record<IssuePriority, number>> = {
    Critical: 0,
    High: 1,
    Medium: 2,
    Low: 3
  };

  /**
   * Priority assigned when the incoming label is not one of the four known
   * values, so the issue still sorts deterministically and is counted in
   * the overview and text summary instead of silently disappearing.
   */
  private static readonly FALLBACK_PRIORITY: IssuePriority = 'Medium';

  /**
   * Query identifier -> user-friendly issue name. A Map is used so that a
   * query named like an Object.prototype member (e.g. "constructor") cannot
   * resolve to an inherited property.
   */
  private static readonly ISSUE_NAMES: ReadonlyMap<string, string> = new Map<string, string>([
    ['missing-titles', 'Page Titles: Missing'],
    ['broken-internal-links', 'Links: Broken Internal Links'],
    ['server-errors', 'Response Codes: Internal Server Error (5xx)'],
    ['404-errors', 'Response Codes: Internal Client Error (4xx)'],
    ['duplicate-titles', 'Page Titles: Duplicate'],
    ['duplicate-meta-descriptions', 'Meta Description: Duplicate'],
    ['missing-meta-descriptions', 'Meta Description: Missing'],
    ['thin-content', 'Content: Low Content Pages'],
    ['missing-h1', 'H1: Missing'],
    ['multiple-h1', 'H1: Multiple'],
    ['duplicate-h1', 'H1: Duplicate'],
    ['redirects', 'Response Codes: Internal Redirection (3xx)'],
    ['orphan-pages', 'Links: Orphan Pages'],
    ['canonical-issues', 'Canonical: Issues'],
    ['non-https', 'Security: Non-HTTPS URLs'],
    ['heading-hierarchy-issues', 'Heading Hierarchy: Issues'],
    ['missing-hsts', 'Security: Missing HSTS Header'],
    ['missing-csp', 'Security: Missing Content-Security-Policy Header'],
    ['missing-x-frame-options', 'Security: Missing X-Frame-Options Header'],
    ['missing-referrer-policy', 'Security: Missing Secure Referrer-Policy Header'],
    ['unsafe-external-links', 'Security: Unsafe Cross-Origin Links'],
    ['protocol-relative-links', 'Security: Protocol-Relative Resource Links'],
    ['title-length', 'Page Titles: Length Issues'],
    ['meta-description-length', 'Meta Description: Length Issues'],
    ['title-equals-h1', 'Page Titles: Same as H1'],
    ['no-outbound-links', 'Links: Pages With No Outbound Links'],
    ['high-external-links', 'Links: Pages With High External Outlinks'],
    ['missing-images', 'Images: Missing On Content Pages']
  ]);

  private static readonly HEAVY_RULE = '═══════════════════════════════════════════════════════';
  private static readonly LIGHT_RULE = '───────────────────────────────────────────────────────';

  /**
   * Convert internal SEO issues to structured format.
   *
   * @param issues Raw findings.
   * @param totalPages Number of pages crawled; used for percentages.
   * @param executionTime Analysis duration, passed through unchanged.
   * @returns Report whose issues are sorted by priority, with per-priority
   *   counts in `overview`.
   */
  static formatReport(
    issues: SEOIssue[],
    totalPages: number,
    executionTime: number
  ): StructuredReport {
    const rank = StructuredReportFormatter.PRIORITY_RANK;

    const formattedIssues = issues
      .map((issue) => StructuredReportFormatter.formatIssue(issue, totalPages))
      // Array.prototype.sort is stable, so equal priorities keep input order.
      .sort((a, b) => rank[a.issuePriority] - rank[b.issuePriority]);

    const perPriority: Record<IssuePriority, number> = { Critical: 0, High: 0, Medium: 0, Low: 0 };
    for (const issue of formattedIssues) {
      perPriority[issue.issuePriority] += 1;
    }

    return {
      overview: {
        totalPages,
        totalIssues: formattedIssues.length,
        criticalIssues: perPriority.Critical,
        highPriorityIssues: perPriority.High,
        mediumPriorityIssues: perPriority.Medium,
        lowPriorityIssues: perPriority.Low
      },
      issues: formattedIssues,
      executionTime
    };
  }

  /**
   * Format individual issue to structured style.
   */
  private static formatIssue(
    issue: SEOIssue,
    totalPages: number
  ): StructuredIssue {
    // Guard the division: with zero pages the ratio would be NaN or
    // Infinity, which JSON.stringify renders as null.
    const share = totalPages > 0 ? (issue.affectedCount / totalPages) * 100 : 0;

    return {
      issueName: StructuredReportFormatter.formatIssueName(issue),
      issueType: StructuredReportFormatter.getIssueType(issue.category, issue.priority),
      issuePriority: StructuredReportFormatter.normalizePriority(issue.priority),
      urls: issue.affectedCount,
      percentOfTotal: Number.isFinite(share) ? parseFloat(share.toFixed(2)) : 0,
      description: issue.description,
      howToFix: issue.fix,
      examples: issue.examples
    };
  }

  /**
   * Convert query name to user-friendly issue name; unknown queries are
   * returned as-is.
   */
  private static formatIssueName(issue: SEOIssue): string {
    return StructuredReportFormatter.ISSUE_NAMES.get(issue.query) ?? issue.query;
  }

  /**
   * Determine issue type based on category and priority.
   *
   * Critical/High priority is always an 'Issue'; otherwise the
   * "opportunities" category yields 'Opportunity' and everything else
   * 'Warning'. Priority is compared case-insensitively, consistent with
   * normalizePriority.
   */
  private static getIssueType(
    category: string,
    priority: string
  ): IssueType {
    const level = priority.trim().toUpperCase();

    if (level === 'CRITICAL' || level === 'HIGH') {
      return 'Issue';
    }

    return category === 'opportunities' ? 'Opportunity' : 'Warning';
  }

  /**
   * Normalize priority to title case. Labels outside the four known
   * priorities map to FALLBACK_PRIORITY rather than being cast through.
   */
  private static normalizePriority(priority: string): IssuePriority {
    const label = priority.trim();
    const titled = label.charAt(0).toUpperCase() + label.slice(1).toLowerCase();

    const known = Object.prototype.hasOwnProperty.call(
      StructuredReportFormatter.PRIORITY_RANK,
      titled
    );
    return known ? (titled as IssuePriority) : StructuredReportFormatter.FALLBACK_PRIORITY;
  }

  /**
   * Generate plain text summary for terminal output.
   *
   * Critical and High issues are listed with full detail; Medium and Low
   * issues as one line each. Sections without issues are omitted.
   */
  static generateTextSummary(report: StructuredReport): string {
    const { overview } = report;
    const heavy = StructuredReportFormatter.HEAVY_RULE;

    const lines: string[] = [
      heavy,
      ' SEO AUDIT SUMMARY',
      heavy,
      '',
      `Total Pages Crawled: ${overview.totalPages.toLocaleString()}`,
      `Total Issues Found: ${overview.totalIssues}`,
      '',
      'Issues by Priority:',
      ` Critical: ${overview.criticalIssues}`,
      ` High: ${overview.highPriorityIssues}`,
      ` Medium: ${overview.mediumPriorityIssues}`,
      ` Low: ${overview.lowPriorityIssues}`,
      '',
      `Analysis Time: ${report.executionTime}ms`,
      heavy,
      ''
    ];

    const withPriority = (priority: IssuePriority): StructuredIssue[] =>
      report.issues.filter((issue) => issue.issuePriority === priority);

    StructuredReportFormatter.appendDetailedSection(
      lines,
      'CRITICAL ISSUES (Fix Immediately)',
      withPriority('Critical')
    );
    StructuredReportFormatter.appendDetailedSection(
      lines,
      'HIGH PRIORITY ISSUES',
      withPriority('High')
    );
    StructuredReportFormatter.appendCompactSection(
      lines,
      'MEDIUM PRIORITY ISSUES',
      withPriority('Medium')
    );
    StructuredReportFormatter.appendCompactSection(
      lines,
      'LOW PRIORITY / OPPORTUNITIES',
      withPriority('Low')
    );

    return lines.join('\n');
  }

  /** Append a section listing every issue with all of its fields. */
  private static appendDetailedSection(
    lines: string[],
    heading: string,
    issues: StructuredIssue[]
  ): void {
    if (issues.length === 0) {
      return;
    }

    lines.push(heading, StructuredReportFormatter.LIGHT_RULE);
    for (const issue of issues) {
      lines.push(
        '',
        `${issue.issueName}`,
        ` URLs Affected: ${issue.urls} (${issue.percentOfTotal}%)`,
        ` Type: ${issue.issueType}`,
        ` Description: ${issue.description}`,
        ` How To Fix: ${issue.howToFix}`
      );
    }
    lines.push('');
  }

  /** Append a section listing each issue on a single line. */
  private static appendCompactSection(
    lines: string[],
    heading: string,
    issues: StructuredIssue[]
  ): void {
    if (issues.length === 0) {
      return;
    }

    lines.push(heading, StructuredReportFormatter.LIGHT_RULE);
    for (const issue of issues) {
      lines.push(`${issue.issueName} - ${issue.urls} URLs (${issue.percentOfTotal}%)`);
    }
    lines.push('');
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// Copyright 2026 Richard Baxter
|
|
4
|
+
// Licensed under the Apache License, Version 2.0
|
|
5
|
+
|
|
6
|
+
// CRITICAL: Disable ALL Crawlee logging for MCP stdio compatibility
|
|
7
|
+
// Turn Crawlee's logger off through its environment switch; this server is
// connected over a stdio transport further down in this file.
// NOTE(review): if this file is emitted as an ES module, the static imports
// below are evaluated before this assignment runs - confirm Crawlee reads
// the variable lazily rather than at import time.
process.env['CRAWLEE_LOG_LEVEL'] = 'OFF';
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Crawlee MCP Server v2
|
|
11
|
+
*
|
|
12
|
+
* Professional website crawler and SEO analyzer
|
|
13
|
+
* Built with @modelcontextprotocol/sdk and Crawlee
|
|
14
|
+
*
|
|
15
|
+
* Phase 1: MCP server skeleton with tool registration ✅
|
|
16
|
+
* Phase 2: Full crawling engine implementation ✅
|
|
17
|
+
* Phase 3: SEO analysis layer ✅
|
|
18
|
+
* Phase 4: Fixed RequestQueue persistence bug ✅
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
22
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
23
|
+
import {
|
|
24
|
+
CallToolRequestSchema,
|
|
25
|
+
ListToolsRequestSchema,
|
|
26
|
+
Tool
|
|
27
|
+
} from '@modelcontextprotocol/sdk/types.js';
|
|
28
|
+
|
|
29
|
+
import { runSeoAudit } from './tools/run-seo-audit.js';
|
|
30
|
+
import { analyzeSeo } from './tools/analyze-seo.js';
|
|
31
|
+
import { querySeoData } from './tools/query-seo-data.js';
|
|
32
|
+
import { listQueries } from './tools/list-queries.js';
|
|
33
|
+
|
|
34
|
+
/** Name this server reports to MCP clients. */
const SERVER_NAME = 'seo-crawler-mcp';
/** Version this server reports to MCP clients. */
const SERVER_VERSION = '2.0.1'; // Version bump for storage cleanup fix

/**
 * Tool catalogue returned for ListTools requests.
 *
 * Count-like inputs are declared as JSON Schema `integer` (their
 * descriptions define whole-number ranges), and the defaults stated in the
 * descriptions are also exposed through the machine-readable `default`
 * keyword so clients do not have to parse prose.
 */
const tools: Tool[] = [
  {
    name: 'run_seo_audit',
    description: 'Crawl a website and extract comprehensive SEO data using Crawlee HttpCrawler. Returns crawl ID and output path.',
    inputSchema: {
      type: 'object',
      properties: {
        url: {
          type: 'string',
          description: 'Starting URL to crawl (must include http:// or https://)'
        },
        maxPages: {
          type: 'integer',
          description: 'Maximum number of pages to crawl (1-10000). Default: 1000',
          minimum: 1,
          maximum: 10000,
          default: 1000
        },
        depth: {
          type: 'integer',
          description: 'Maximum crawl depth (1-10). Default: 3',
          minimum: 1,
          maximum: 10,
          default: 3
        },
        userAgent: {
          type: 'string',
          enum: ['chrome', 'googlebot'],
          description: 'User agent to identify as: "chrome" (default, Chrome browser) or "googlebot" (Googlebot crawler). Default: chrome',
          default: 'chrome'
        }
      },
      required: ['url']
    }
  },
  {
    name: 'analyze_seo',
    description: 'Analyze SEO data from a completed crawl. Runs 25+ SQL queries to detect critical issues, content problems, technical SEO issues, security vulnerabilities, and optimization opportunities. Returns structured report with affected URLs and fix recommendations.',
    inputSchema: {
      type: 'object',
      properties: {
        crawlPath: {
          type: 'string',
          description: 'Path to crawl output directory (e.g., C:/seo-audits/example.com_2026-02-01_abc123)'
        },
        includeCategories: {
          type: 'array',
          items: {
            type: 'string',
            enum: ['critical', 'content', 'technical', 'security', 'opportunities']
          },
          description: 'Optional: Filter analysis by categories. Default: all categories'
        },
        maxExamplesPerIssue: {
          type: 'integer',
          description: 'Maximum example URLs to return per issue. Default: 10',
          minimum: 1,
          maximum: 100,
          default: 10
        },
        format: {
          type: 'string',
          enum: ['detailed', 'summary', 'structured'],
          description: 'Output format: "structured" (organized format, default), "summary" (text overview), "detailed" (full JSON). Default: structured',
          default: 'structured'
        }
      },
      required: ['crawlPath']
    }
  },
  {
    name: 'query_seo_data',
    description: 'Execute a specific SEO analysis query by name. Use list_seo_queries to see available queries. Returns detailed results with affected URLs and context.',
    inputSchema: {
      type: 'object',
      properties: {
        crawlPath: {
          type: 'string',
          description: 'Path to crawl output directory'
        },
        query: {
          type: 'string',
          description: 'Query name (e.g., "missing-titles", "duplicate-h1", "orphan-pages"). Use list_seo_queries to see all available queries.'
        },
        limit: {
          type: 'integer',
          description: 'Optional: Maximum number of results to return. Default: 100',
          minimum: 1,
          maximum: 1000,
          default: 100
        }
      },
      required: ['crawlPath', 'query']
    }
  },
  {
    name: 'list_seo_queries',
    description: 'List all available SEO analysis queries with descriptions, priorities, and fix recommendations. Optionally filter by category or priority level.',
    inputSchema: {
      type: 'object',
      properties: {
        category: {
          type: 'string',
          enum: ['critical', 'content', 'technical', 'security', 'opportunities'],
          description: 'Optional: Filter by category'
        },
        priority: {
          type: 'string',
          enum: ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW'],
          description: 'Optional: Filter by priority level'
        }
      }
    }
  }
];
|
|
146
|
+
|
|
147
|
+
/**
 * MCP server wrapper: registers the tool catalogue, dispatches tool calls to
 * the tool implementations, and connects over a stdio transport.
 */
class CrawleeMcpServer {
  private readonly server: Server;

  constructor() {
    this.server = new Server(
      {
        name: SERVER_NAME,
        version: SERVER_VERSION
      },
      {
        capabilities: {
          tools: {}
        }
      }
    );

    this.setupHandlers();
    this.setupErrorHandling();
  }

  /** Extract a readable message from whatever a tool threw. */
  private static describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    // Keep the text of a thrown string instead of discarding it.
    if (typeof error === 'string' && error.length > 0) {
      return error;
    }
    return 'Unknown error';
  }

  /**
   * Register the ListTools and CallTool request handlers.
   *
   * Tool failures are reported as a normal response flagged with
   * `isError: true` (carrying the error message and tool name) rather than
   * being rethrown.
   */
  private setupHandlers(): void {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;

      try {
        let result: unknown;

        // Arguments are forwarded untyped; their shape is not checked here.
        switch (name) {
          case 'run_seo_audit':
            result = await runSeoAudit(args as any);
            break;

          case 'analyze_seo':
            result = await analyzeSeo(args as any);
            break;

          case 'query_seo_data':
            result = await querySeoData(args as any);
            break;

          case 'list_seo_queries':
            result = await listQueries(args as any);
            break;

          default:
            return {
              content: [
                {
                  type: 'text',
                  text: `Unknown tool: ${name}`
                }
              ],
              isError: true
            };
        }

        return {
          content: [
            {
              type: 'text',
              // JSON.stringify(undefined) is undefined, which is not valid
              // text content; serialize a missing result as "null".
              text: JSON.stringify(result ?? null, null, 2)
            }
          ]
        };
      } catch (error) {
        return {
          content: [
            {
              type: 'text',
              text: JSON.stringify(
                {
                  error: CrawleeMcpServer.describeError(error),
                  tool: name
                },
                null,
                2
              )
            }
          ],
          isError: true
        };
      }
    });
  }

  /**
   * Log server-level errors to stderr and close the server on SIGINT or
   * SIGTERM before exiting.
   */
  private setupErrorHandling(): void {
    this.server.onerror = (error) => {
      console.error('[MCP Error]', error);
    };

    const shutdown = async (): Promise<void> => {
      let exitCode = 0;
      try {
        await this.server.close();
      } catch (error) {
        // A failed close must not leave the process hanging.
        exitCode = 1;
        console.error('[MCP Error]', error);
      }
      process.exit(exitCode);
    };

    // `once`: a repeated signal falls through to the default handler and
    // terminates the process if the graceful close is stuck.
    process.once('SIGINT', () => {
      void shutdown();
    });
    process.once('SIGTERM', () => {
      void shutdown();
    });
  }

  /**
   * Connect the server to a stdio transport and print the startup banner
   * to stderr.
   */
  async run(): Promise<void> {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);

    console.error(`${SERVER_NAME} v${SERVER_VERSION} running on stdio`);
    console.error('✅ Phase 1: MCP server active');
    console.error('✅ Phase 2: Crawling engine ready');
    console.error('✅ Phase 3: SEO analysis layer active');
    console.error('✅ Phase 4: RequestQueue bug fixed');
  }
}
|
|
257
|
+
|
|
258
|
+
// Entry point: build the server and start it on stdio.
const server = new CrawleeMcpServer();
server.run().catch((error: unknown) => {
  // A server that failed to start cannot serve requests: report the cause on
  // stderr and exit with a failing status instead of only logging it.
  console.error(error);
  process.exit(1);
});
|