@houtini/seo-crawler-mcp 2.1.2 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/houtini-logo.jpg +0 -0
  2. package/package.json +9 -1
  3. package/server.json +2 -2
  4. package/.github/workflows/ci.yml +0 -59
  5. package/src/analyzers/QueryLoader.ts +0 -175
  6. package/src/analyzers/queries/README.md +0 -228
  7. package/src/analyzers/queries/content/duplicate-h1.sql +0 -18
  8. package/src/analyzers/queries/content/duplicate-meta-descriptions.sql +0 -18
  9. package/src/analyzers/queries/content/duplicate-titles.sql +0 -19
  10. package/src/analyzers/queries/content/missing-h1.sql +0 -18
  11. package/src/analyzers/queries/content/missing-meta-descriptions.sql +0 -19
  12. package/src/analyzers/queries/content/multiple-h1.sql +0 -17
  13. package/src/analyzers/queries/content/thin-content.sql +0 -18
  14. package/src/analyzers/queries/critical/404-errors.sql +0 -14
  15. package/src/analyzers/queries/critical/broken-internal-links.sql +0 -22
  16. package/src/analyzers/queries/critical/missing-titles.sql +0 -17
  17. package/src/analyzers/queries/critical/server-errors.sql +0 -15
  18. package/src/analyzers/queries/opportunities/high-external-links.sql +0 -18
  19. package/src/analyzers/queries/opportunities/meta-description-length.sql +0 -27
  20. package/src/analyzers/queries/opportunities/missing-images.sql +0 -18
  21. package/src/analyzers/queries/opportunities/no-outbound-links.sql +0 -18
  22. package/src/analyzers/queries/opportunities/title-equals-h1.sql +0 -21
  23. package/src/analyzers/queries/opportunities/title-length.sql +0 -27
  24. package/src/analyzers/queries/opportunities/uncrawled-internal-links.sql +0 -20
  25. package/src/analyzers/queries/security/missing-csp.sql +0 -16
  26. package/src/analyzers/queries/security/missing-hsts.sql +0 -17
  27. package/src/analyzers/queries/security/missing-referrer-policy.sql +0 -16
  28. package/src/analyzers/queries/security/missing-x-frame-options.sql +0 -16
  29. package/src/analyzers/queries/security/protocol-relative-links.sql +0 -16
  30. package/src/analyzers/queries/security/unsafe-external-links.sql +0 -17
  31. package/src/analyzers/queries/technical/canonical-issues.sql +0 -20
  32. package/src/analyzers/queries/technical/heading-hierarchy-issues.sql +0 -19
  33. package/src/analyzers/queries/technical/non-https.sql +0 -16
  34. package/src/analyzers/queries/technical/orphan-pages.sql +0 -21
  35. package/src/analyzers/queries/technical/redirects.sql +0 -15
  36. package/src/cli.ts +0 -228
  37. package/src/core/ContentExtractor.ts +0 -480
  38. package/src/core/CrawlDatabase.ts +0 -736
  39. package/src/core/CrawlOrchestrator.ts +0 -346
  40. package/src/core/CrawlStorage.ts +0 -148
  41. package/src/core/LinkExtractor.ts +0 -119
  42. package/src/core/UrlManager.ts +0 -110
  43. package/src/formatters/structured-report-format.ts +0 -254
  44. package/src/index.ts +0 -261
  45. package/src/schema/index.ts +0 -176
  46. package/src/tools/analyze-seo.ts +0 -184
  47. package/src/tools/list-queries.ts +0 -70
  48. package/src/tools/query-seo-data.ts +0 -77
  49. package/src/tools/run-seo-audit.ts +0 -91
  50. package/src/types/index.ts +0 -179
  51. package/src/utils/debug.ts +0 -12
  52. package/tsconfig.json +0 -26
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@houtini/seo-crawler-mcp",
3
- "version": "2.1.2",
3
+ "version": "2.1.3",
4
4
  "mcpName": "io.github.houtini-ai/seo-crawler-mcp",
5
5
  "description": "Crawl and analyse websites for SEO errors and issues using Crawlee with SQLite storage",
6
6
  "type": "module",
@@ -23,6 +23,14 @@
23
23
  "audit",
24
24
  "model-context-protocol"
25
25
  ],
26
+ "files": [
27
+ "build",
28
+ "README.md",
29
+ "LICENSE",
30
+ "NOTICE",
31
+ "houtini-logo.jpg",
32
+ "server.json"
33
+ ],
26
34
  "engines": {
27
35
  "node": ">=18.0.0"
28
36
  },
package/server.json CHANGED
@@ -6,12 +6,12 @@
6
6
  "url": "https://github.com/houtini-ai/seo-crawler-mcp",
7
7
  "source": "github"
8
8
  },
9
- "version": "2.0.1",
9
+ "version": "2.1.3",
10
10
  "packages": [
11
11
  {
12
12
  "registryType": "npm",
13
13
  "identifier": "@houtini/seo-crawler-mcp",
14
- "version": "2.0.1",
14
+ "version": "2.1.3",
15
15
  "transport": {
16
16
  "type": "stdio"
17
17
  },
@@ -1,59 +0,0 @@
1
# Continuous integration for pushes and pull requests targeting main.
# Two jobs: `build` (install, compile, test, type-check on a Node.js matrix)
# and `lint` (a type-check pass on a single Node.js version).
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version:
          - 18.x
          - 20.x
          - 22.x

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: npm

      - name: Install dependencies
        run: npm ci

      - name: Run TypeScript build
        run: npm run build

      # Test failures are reported but do not fail the job.
      - name: Run tests
        run: npm test
        continue-on-error: true

      # Strict type-check: a compiler error fails the job.
      - name: Check for TypeScript errors
        run: npx tsc --noEmit

  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20.x"
          cache: npm

      - name: Install dependencies
        run: npm ci

      # The trailing `|| echo` means this step succeeds even when tsc reports
      # errors; the strict check lives in the build job.
      - name: Check formatting
        run: |
          echo "Checking for common code issues..."
          npx tsc --noEmit || echo "TypeScript check completed"
@@ -1,175 +0,0 @@
1
- import fs from 'fs';
2
- import path from 'path';
3
- import { fileURLToPath } from 'url';
4
-
5
- const __filename = fileURLToPath(import.meta.url);
6
- const __dirname = path.dirname(__filename);
7
-
8
/**
 * Parsed representation of one bundled `.sql` audit query.
 */
export interface QueryMetadata {
  /** Query identifier: the `.sql` file name without its extension. */
  name: string;
  /** Category sub-directory the file was loaded from. */
  category:
    | 'critical'
    | 'content'
    | 'technical'
    | 'security'
    | 'opportunities';
  /** Value of the `-- Priority:` header comment; the loader defaults to `MEDIUM`. */
  priority:
    | 'CRITICAL'
    | 'HIGH'
    | 'MEDIUM'
    | 'LOW';
  /** First free-form header comment of the file (empty string if none). */
  description: string;
  /** Text of the `-- Impact:` header comment (empty string if absent). */
  impact: string;
  /** Text of the `-- Fix:` header comment (empty string if absent). */
  fix: string;
  /** The SQL statement text extracted from the file. */
  sql: string;
}
17
-
18
/**
 * Resolve the directory that contains the `.sql` query files.
 *
 * Probes a fixed list of locations, in order, and returns the first one that
 * exists on disk.
 *
 * @returns Path of the first existing candidate directory.
 * @throws Error listing every candidate when none of them exists.
 */
function findQueriesPath(): string {
  const cwd = process.cwd();

  const candidates = [
    // Next to the compiled build/analyzers/QueryLoader.js
    path.join(__dirname, 'queries'),
    // Relative to src/analyzers/QueryLoader.ts
    path.join(__dirname, '..', '..', 'src', 'analyzers', 'queries'),
    // Source tree under the current working directory (project root)
    path.join(cwd, 'src', 'analyzers', 'queries'),
    // Build output under the current working directory (tests run from root)
    path.join(cwd, 'build', 'analyzers', 'queries'),
  ];

  const found = candidates.find((dir) => fs.existsSync(dir));
  if (found !== undefined) {
    return found;
  }

  throw new Error(`Could not find queries directory. Tried: ${candidates.join(', ')}`);
}
39
-
40
/**
 * In-memory catalogue of the bundled SQL audit queries.
 *
 * On construction it resolves the queries directory via `findQueriesPath()`,
 * synchronously reads every `.sql` file under the known category
 * sub-directories, and indexes the parsed metadata by query name (the file
 * name without its `.sql` suffix).
 *
 * NOTE: the name is the only key, so two files with the same name in
 * different category directories overwrite each other (the later category in
 * the load order wins).
 */
export class QueryLoader {
  private queriesPath: string;
  private queries: Map<string, QueryMetadata>;

  /**
   * @throws Error (from `findQueriesPath`) when no queries directory exists.
   */
  constructor() {
    this.queriesPath = findQueriesPath();
    this.queries = new Map();
    this.loadAllQueries();
  }

  /** Read and parse every `.sql` file of every known category directory. */
  private loadAllQueries(): void {
    const categories: QueryMetadata['category'][] = [
      'critical',
      'content',
      'technical',
      'security',
      'opportunities',
    ];

    for (const category of categories) {
      const categoryPath = path.join(this.queriesPath, category);

      // A missing category directory is not an error; it just contributes nothing.
      if (!fs.existsSync(categoryPath)) {
        continue;
      }

      const files = fs.readdirSync(categoryPath).filter(f => f.endsWith('.sql'));

      for (const file of files) {
        const filePath = path.join(categoryPath, file);
        // Strip only the trailing extension. `replace('.sql', '')` would remove
        // the first occurrence anywhere in the name (e.g. "a.sql-old.sql").
        const queryName = file.slice(0, -'.sql'.length);
        const metadata = this.parseQueryFile(filePath, queryName, category);

        this.queries.set(queryName, metadata);
      }
    }
  }

  /**
   * Parse one query file into its metadata and SQL text.
   *
   * Header format (all `--` comment lines):
   *  - `Priority: <CRITICAL|HIGH|MEDIUM|LOW>` sets the priority; matching is
   *    case-insensitive and unrecognised values keep the `MEDIUM` default;
   *  - `Impact: ...` and `Fix: ...` set the corresponding fields;
   *  - `Category: ...` is ignored (the category comes from the directory);
   *  - the first other non-empty comment becomes the description.
   *
   * The SQL text starts at the first non-comment line beginning with `SELECT`
   * or `WITH` (case-insensitive) and runs to the end of the file.
   *
   * @param filePath Absolute or relative path of the `.sql` file.
   * @param name     Query name to record in the metadata.
   * @param category Category to record in the metadata.
   * @returns The parsed metadata; `sql` is empty if no statement start is found.
   */
  private parseQueryFile(
    filePath: string,
    name: string,
    category: QueryMetadata['category']
  ): QueryMetadata {
    const content = fs.readFileSync(filePath, 'utf-8');
    // Accept both LF and CRLF so no stray '\r' ends up in the SQL text.
    const lines = content.split(/\r?\n/);

    let description = '';
    let priority: QueryMetadata['priority'] = 'MEDIUM';
    let impact = '';
    let fix = '';
    const sqlLines: string[] = [];
    let inSQL = false;

    for (const line of lines) {
      const trimmed = line.trim();

      if (trimmed.startsWith('--')) {
        const comment = trimmed.substring(2).trim();

        if (comment.startsWith('Priority:')) {
          const value = comment.slice('Priority:'.length).trim().toUpperCase();
          // Only accept values that are members of the declared union.
          if (value === 'CRITICAL' || value === 'HIGH' || value === 'MEDIUM' || value === 'LOW') {
            priority = value;
          }
        } else if (comment.startsWith('Impact:')) {
          impact = comment.slice('Impact:'.length).trim();
        } else if (comment.startsWith('Fix:')) {
          fix = comment.slice('Fix:'.length).trim();
        } else if (comment && !comment.startsWith('Category:')) {
          if (!description) {
            description = comment;
          }
        }
      } else if (/^(?:SELECT|WITH)\b/i.test(trimmed)) {
        inSQL = true;
      }

      // Once the statement has started, keep every line (including comments).
      if (inSQL) {
        sqlLines.push(line);
      }
    }

    return {
      name,
      category,
      priority,
      description,
      impact,
      fix,
      sql: sqlLines.join('\n').trim()
    };
  }

  /** Look up a query by name; `undefined` when no such query was loaded. */
  getQuery(name: string): QueryMetadata | undefined {
    return this.queries.get(name);
  }

  /** All loaded queries, in load order. */
  getAllQueries(): QueryMetadata[] {
    return Array.from(this.queries.values());
  }

  /** Loaded queries whose category equals `category`. */
  getQueriesByCategory(category: QueryMetadata['category']): QueryMetadata[] {
    return this.getAllQueries().filter(q => q.category === category);
  }

  /** Loaded queries whose priority equals `priority`. */
  getQueriesByPriority(priority: QueryMetadata['priority']): QueryMetadata[] {
    return this.getAllQueries().filter(q => q.priority === priority);
  }

  /** Shorthand for `getQueriesByPriority('CRITICAL')`. */
  getCriticalQueries(): QueryMetadata[] {
    return this.getQueriesByPriority('CRITICAL');
  }

  /** Shorthand for `getQueriesByPriority('HIGH')`. */
  getHighPriorityQueries(): QueryMetadata[] {
    return this.getQueriesByPriority('HIGH');
  }

  /** Names of all loaded queries, sorted with the default string ordering. */
  listQueryNames(): string[] {
    return Array.from(this.queries.keys()).sort();
  }

  /**
   * Count the loaded queries overall, per category and per priority.
   * Categories and priorities with no queries are absent from the maps.
   */
  getQueryStats(): {
    total: number;
    byCategory: Record<string, number>;
    byPriority: Record<string, number>;
  } {
    const queries = this.getAllQueries();

    const byCategory: Record<string, number> = {};
    const byPriority: Record<string, number> = {};

    for (const query of queries) {
      byCategory[query.category] = (byCategory[query.category] || 0) + 1;
      byPriority[query.priority] = (byPriority[query.priority] || 0) + 1;
    }

    return {
      total: queries.length,
      byCategory,
      byPriority
    };
  }
}
174
-
175
- export const queryLoader = new QueryLoader();
@@ -1,228 +0,0 @@
1
- # SEO Analysis Query Library
2
-
3
- **Version:** 1.0.0
4
- **Last Updated:** 2026-02-01
5
- **Coverage:** 28 detectable SEO issues
6
-
7
- ## Query Organization
8
-
9
- All queries follow a standard format:
10
- - SQL comments describing the issue
11
- - Priority level (CRITICAL, HIGH, MEDIUM, LOW)
12
- - Category classification
13
- - Optimized SELECT statements with ORDER BY and LIMIT
14
- - Results limited to at most 100 rows for performance (some queries use a lower limit, e.g. 50)
15
-
16
- ## Critical Issues (4 queries)
17
-
18
- **Indexability issues that must be fixed immediately**
19
-
20
- 1. **missing-titles.sql** - Pages without title tags
21
- - Priority: CRITICAL
22
- - Impact: Major indexability problem
23
- - Fix: Add unique, descriptive title tags
24
-
25
- 2. **broken-internal-links.sql** - Internal links to 404/5xx pages
26
- - Priority: CRITICAL
27
- - Impact: Poor user experience, crawl budget waste
28
- - Fix: Update or remove broken links
29
-
30
- 3. **server-errors.sql** - Pages returning 5xx errors
31
- - Priority: CRITICAL
32
- - Impact: Prevents indexing
33
- - Fix: Debug server issues immediately
34
-
35
- 4. **404-errors.sql** - Pages not found
36
- - Priority: CRITICAL
37
- - Impact: Dead ends for users and crawlers
38
- - Fix: Redirect to relevant pages or restore content
39
-
40
- ## Content Quality Issues (7 queries)
41
-
42
- **Problems with page content and metadata**
43
-
44
- 5. **duplicate-titles.sql** - Multiple pages with same title
45
- - Priority: HIGH
46
- - Impact: Cannibalization, poor CTR
47
- - Fix: Create unique titles for each page
48
-
49
- 6. **duplicate-meta-descriptions.sql** - Duplicate meta descriptions
50
- - Priority: MEDIUM
51
- - Impact: Reduced CTR, missed opportunities
52
- - Fix: Write unique descriptions
53
-
54
- 7. **missing-meta-descriptions.sql** - Pages without descriptions
55
- - Priority: MEDIUM
56
- - Impact: Search engines auto-generate poor snippets
57
- - Fix: Add compelling meta descriptions
58
-
59
- 8. **thin-content.sql** - Pages with < 300 words
60
- - Priority: MEDIUM
61
- - Impact: Low quality signals
62
- - Fix: Expand content or consolidate pages
63
-
64
- 9. **missing-h1.sql** - Pages without H1 tags
65
- - Priority: HIGH
66
- - Impact: Unclear page topic
67
- - Fix: Add descriptive H1 tags
68
-
69
- 10. **multiple-h1.sql** - Pages with multiple H1 tags
70
- - Priority: MEDIUM
71
- - Impact: Diluted topical focus
72
- - Fix: Use single H1 per page
73
-
74
- 11. **duplicate-h1.sql** - Multiple pages with same H1
75
- - Priority: MEDIUM
76
- - Impact: Content cannibalization
77
- - Fix: Differentiate H1 tags
78
-
79
- ## Technical SEO Issues (5 queries)
80
-
81
- **Infrastructure and architecture problems**
82
-
83
- 12. **redirects.sql** - Pages with 3xx redirect status
84
- - Priority: MEDIUM
85
- - Impact: Crawl budget waste, slow page speed
86
- - Fix: Update links to final destinations
87
-
88
- 13. **orphan-pages.sql** - Pages with no internal links
89
- - Priority: MEDIUM
90
- - Impact: Difficult to discover and crawl
91
- - Fix: Add internal links from related pages
92
-
93
- 14. **canonical-issues.sql** - Canonical URL differs from actual URL
94
- - Priority: MEDIUM
95
- - Impact: Duplicate content confusion
96
- - Fix: Review canonical implementation
97
-
98
- 15. **non-https.sql** - Pages not using HTTPS
99
- - Priority: MEDIUM
100
- - Impact: Security warnings, ranking penalty
101
- - Fix: Migrate to HTTPS
102
-
103
- 16. **heading-hierarchy-issues.sql** - Non-sequential headings
104
- - Priority: MEDIUM
105
- - Impact: Poor document structure
106
- - Fix: Correct heading order (h1 → h2 → h3)
107
-
108
- ## Security Issues (6 queries)
109
-
110
- **Security header and link security problems**
111
-
112
- 17. **missing-hsts.sql** - No Strict-Transport-Security header
113
- - Priority: HIGH
114
- - Impact: HTTPS downgrade attacks possible
115
- - Fix: Add HSTS header to server config
116
-
117
- 18. **missing-csp.sql** - No Content-Security-Policy header
118
- - Priority: MEDIUM
119
- - Impact: XSS vulnerability
120
- - Fix: Implement CSP header
121
-
122
- 19. **missing-x-frame-options.sql** - No X-Frame-Options header
123
- - Priority: MEDIUM
124
- - Impact: Clickjacking vulnerability
125
- - Fix: Add X-Frame-Options: DENY
126
-
127
- 20. **missing-referrer-policy.sql** - No Referrer-Policy header
128
- - Priority: LOW
129
- - Impact: Privacy leaks
130
- - Fix: Add Referrer-Policy header
131
-
132
- 21. **unsafe-external-links.sql** - target="_blank" without rel="noopener"
133
- - Priority: MEDIUM
134
- - Impact: Tabnabbing vulnerability
135
- - Fix: Add rel="noopener noreferrer"
136
-
137
- 22. **protocol-relative-links.sql** - Links using //example.com format
138
- - Priority: LOW
139
- - Impact: Mixed content warnings
140
- - Fix: Use absolute HTTPS URLs
141
-
142
- ## Optimization Opportunities (6 queries)
143
-
144
- **Enhancement opportunities for better SEO**
145
-
146
- 23. **title-length.sql** - Titles too short (< 30) or too long (> 60)
147
- - Priority: MEDIUM
148
- - Impact: Truncated or poor SERP display
149
- - Fix: Optimize title length to 30-60 characters
150
-
151
- 24. **meta-description-length.sql** - Descriptions too short/long
152
- - Priority: LOW
153
- - Impact: Suboptimal SERP snippets
154
- - Fix: Optimize to 120-160 characters
155
-
156
- 25. **title-equals-h1.sql** - Title and H1 are identical
157
- - Priority: LOW
158
- - Impact: Missed keyword opportunity
159
- - Fix: Differentiate title and H1 slightly
160
-
161
- 26. **no-outbound-links.sql** - Pages with no links
162
- - Priority: LOW
163
- - Impact: Poor user experience, low PageRank flow
164
- - Fix: Add relevant internal/external links
165
-
166
- 27. **high-external-links.sql** - Pages with > 20 external links
167
- - Priority: LOW
168
- - Impact: Excessive PageRank dilution
169
- - Fix: Review and reduce external links
170
-
171
- 28. **missing-images.sql** - Content pages without images
172
- - Priority: LOW
173
- - Impact: Poor engagement, no image search visibility
174
- - Fix: Add relevant images with alt text
175
-
176
- ## Query Performance
177
-
178
- All queries are optimized for SQLite with:
179
- - Indexed columns (url, status_code, depth)
180
- - LIMIT clauses to prevent excessive results
181
- - Strategic WHERE clauses to filter early
182
- - Simple JOINs where necessary
183
-
184
- **Expected Performance:**
185
- - Simple queries: < 10ms
186
- - Complex queries (duplicates, orphans): < 100ms
187
- - Join queries (broken links): < 200ms
188
-
189
- ## Usage Patterns
190
-
191
- ### Direct SQL Execution
192
- ```typescript
193
- import Database from 'better-sqlite3';
194
- import fs from 'fs';
195
-
196
- const db = new Database('./crawl-data.db');
197
- const query = fs.readFileSync('./queries/critical/missing-titles.sql', 'utf-8');
198
- const results = db.prepare(query).all();
199
- ```
200
-
201
- ### Programmatic Analysis
202
- ```typescript
203
- import { SQLAnalyzer } from './SQLAnalyzer.js';
204
-
205
- const analyzer = new SQLAnalyzer(crawlId);
206
- const report = await analyzer.generateReport();
207
- // Returns structured SEOAnalysisReport
208
- ```
209
-
210
- ### MCP Tool Integration
211
- ```bash
212
- seo-crawler-mcp:analyze_seo crawlId="431841d4"
213
- # Returns JSON report with all 28 issues checked
214
- ```
215
-
216
- ## Future Enhancements
217
-
218
- **Not yet implemented (requires additional data capture):**
219
- - Core Web Vitals analysis (requires Playwright)
220
- - Robots.txt validation (requires separate parser)
221
- - Readability scoring (requires text analysis library)
222
- - Mobile rendering issues (requires device emulation)
223
-
224
- ---
225
-
226
- **Query Coverage: 28 Production-Ready SEO Checks**
227
- **Status: Production Ready**
228
- **Next Step: Build SQLAnalyzer.ts class**
@@ -1,18 +0,0 @@
1
-- Duplicate H1 Tags (MEDIUM)
-- Multiple pages sharing the same H1 tag
-- Priority: MEDIUM
-- Category: content
-- One row per H1 text that appears on more than one page with status_code 200:
-- the text, the number of pages, their URLs joined with '|||', and the
-- minimum title among them as an example. At most 50 rows, most-duplicated first.

SELECT
    h1,
    COUNT(*)                 AS page_count,
    GROUP_CONCAT(url, '|||') AS urls,
    MIN(title)               AS example_title
FROM pages
WHERE status_code = 200
  AND h1 IS NOT NULL
  AND TRIM(h1) <> ''
GROUP BY h1
HAVING COUNT(*) > 1
ORDER BY page_count DESC
LIMIT 50;
@@ -1,18 +0,0 @@
1
-- Duplicate Meta Descriptions (MEDIUM)
-- Multiple pages sharing the same meta description
-- Priority: MEDIUM
-- Category: content
-- One row per meta description that appears on more than one page with
-- status_code 200: the text, the number of pages, their URLs joined with
-- '|||', and the minimum title among them as an example. At most 50 rows.

SELECT
    meta_description,
    COUNT(*)                 AS page_count,
    GROUP_CONCAT(url, '|||') AS urls,
    MIN(title)               AS example_title
FROM pages
WHERE status_code = 200
  AND meta_description IS NOT NULL
  AND TRIM(meta_description) <> ''
GROUP BY meta_description
HAVING COUNT(*) > 1
ORDER BY page_count DESC
LIMIT 50;
@@ -1,19 +0,0 @@
1
-- Duplicate Title Tags (HIGH)
-- Multiple pages sharing the same title tag
-- Priority: HIGH
-- Category: content
-- Impact: Title duplication causes keyword cannibalization where pages compete against each other. Reduces click-through rates as users can't distinguish between pages in search results.
-- Fix: Create unique, descriptive title tags for each page that accurately reflect the page's specific content and target different keyword variations.
-- One row per title shared by more than one page with status_code 200: the
-- title, the page URLs joined with ', ', and the number of pages.
-- Whitespace-only titles are excluded: they count as missing titles, not
-- as duplicates of each other.

SELECT
    title,
    GROUP_CONCAT(url, ', ') AS duplicate_urls,
    COUNT(*)                AS count
FROM pages
WHERE title IS NOT NULL
  AND TRIM(title) != ''
  AND status_code = 200
GROUP BY title
HAVING COUNT(*) > 1
ORDER BY count DESC
LIMIT 100;
@@ -1,18 +0,0 @@
1
-- Missing H1 Tags (HIGH)
-- Pages without H1 headings
-- Priority: HIGH
-- Category: content
-- Impact: H1 tags are critical for SEO and accessibility. Their absence makes it unclear to search engines and users what the page is about. Impacts topic clarity and ranking potential.
-- Fix: Ensure every important page has exactly one descriptive H1 tag that clearly indicates the main topic. Include primary keywords naturally without keyword stuffing.
-- HTML pages with status_code 200 whose h1 is NULL, empty, or whitespace-only;
-- longest pages (by word_count) first, at most 100 rows.

SELECT url, title, word_count, depth
FROM pages
WHERE status_code = 200
  AND content_type LIKE '%text/html%'
  AND COALESCE(TRIM(h1), '') = ''
ORDER BY word_count DESC
LIMIT 100;
@@ -1,19 +0,0 @@
1
-- Missing Meta Descriptions (MEDIUM)
-- Pages without meta descriptions
-- Priority: MEDIUM
-- Category: content
-- Impact: Search engines auto-generate snippets which are often poor quality and don't entice clicks. Missed opportunity to control your SERP messaging and improve click-through rates.
-- Fix: Write unique, compelling meta descriptions (120-160 characters) for key pages. Focus on benefits and include a call-to-action. Avoid duplicating title tag content.
-- HTML pages with status_code 200 and depth of at most 5 whose meta_description
-- is NULL, empty, or whitespace-only; longest pages first, at most 100 rows.

SELECT url, title, word_count, depth
FROM pages
WHERE status_code = 200
  AND content_type LIKE '%text/html%'
  AND depth <= 5
  AND COALESCE(TRIM(meta_description), '') = ''
ORDER BY word_count DESC
LIMIT 100;
@@ -1,17 +0,0 @@
1
-- Multiple H1 Tags (MEDIUM)
-- Pages with more than one H1 tag (can confuse search engines)
-- Priority: MEDIUM
-- Category: content
-- Pages with status_code 200 whose heading_count_h1 exceeds 1; pages with the
-- most H1s first, shallower pages first among equals, at most 100 rows.

SELECT url, title, heading_count_h1, h1, status_code, depth
FROM pages
WHERE status_code = 200
  AND heading_count_h1 > 1
ORDER BY heading_count_h1 DESC, depth
LIMIT 100;
@@ -1,18 +0,0 @@
1
-- Thin Content (MEDIUM)
-- Pages with insufficient content (< 300 words)
-- Priority: MEDIUM
-- Category: content
-- HTML pages with status_code 200 and a word_count below 300; shortest pages
-- first, at most 100 rows.

SELECT url, title, word_count, status_code, depth, internal_links
FROM pages
WHERE status_code = 200
  AND content_type LIKE '%text/html%'
  AND word_count < 300
ORDER BY word_count
LIMIT 100;
@@ -1,14 +0,0 @@
1
-- 404 Not Found Errors (CRITICAL)
-- Pages returning 404 errors
-- Priority: CRITICAL
-- Category: critical
-- Pages whose status_code is 404, shallowest first, at most 100 rows.
-- The url tiebreak keeps the selected rows stable when more than 100 pages
-- share the same depth ordering.

SELECT
    url,
    depth,
    linked_from,
    crawled_at
FROM pages
WHERE status_code = 404
ORDER BY depth ASC, url ASC
LIMIT 100;
@@ -1,22 +0,0 @@
1
-- Broken Internal Links (CRITICAL)
-- Internal links pointing to pages that actually returned HTTP errors (404, 500, etc.)
-- Priority: CRITICAL
-- Category: critical
-- Impact: Poor user experience, wasted crawl budget, and loss of link equity. Users encounter dead ends and search engines waste resources crawling broken links.
-- Fix: View URLs that link to errors using the 'inlinks' tab and export them in bulk. Update broken links to point to correct URLs or remove them. Consider implementing 301 redirects for permanently moved content.
-- Note: This query only reports links to pages that were crawled AND returned errors. Un-crawled links are reported separately in the opportunities category.
-- NOTE(review): anchor_text and placement are not part of the GROUP BY, so
-- when one source links to the same target several times the values shown
-- come from one of those link rows; confirm that is acceptable.

SELECT
    l.source_url,
    l.target_url,
    l.anchor_text,
    l.placement,
    p.status_code,
    COUNT(*) AS occurrences
FROM links AS l
JOIN pages AS p
    ON p.url = l.target_url  -- Only check pages we actually crawled
WHERE p.status_code >= 400  -- Only actual HTTP errors (404, 500, etc.)
  AND l.is_internal = 1
GROUP BY l.target_url, l.source_url, p.status_code
ORDER BY occurrences DESC
LIMIT 100;
@@ -1,17 +0,0 @@
1
-- Missing Title Tags (CRITICAL)
-- Pages without title tags - catastrophic for SEO
-- Priority: CRITICAL
-- Category: critical
-- Impact: Pages are essentially invisible to search engines. Title tags are the most important on-page SEO element and their absence prevents proper indexing and ranking.
-- Fix: Add unique, descriptive title tags (50-60 characters) to all pages immediately. Include primary keywords and brand name where appropriate.
-- Pages with status_code 200 and depth of at most 5 whose title is NULL,
-- empty, or whitespace-only; longest pages first, at most 100 rows.

SELECT url, word_count, heading_count_h1
FROM pages
WHERE status_code = 200
  AND depth <= 5
  AND COALESCE(TRIM(title), '') = ''
ORDER BY word_count DESC
LIMIT 100;