@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/analysis/analyze.ts
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
1
|
import { crawl } from '../crawler/crawl.js';
|
|
3
2
|
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
4
3
|
import { normalizeUrl } from '../crawler/normalize.js';
|
|
5
4
|
import { calculateMetrics, Metrics } from '../graph/metrics.js';
|
|
6
5
|
import { Graph, ClusterInfo } from '../graph/graph.js';
|
|
7
6
|
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
8
|
-
import { analyzeH1, analyzeMetaDescription, analyzeTitle,
|
|
7
|
+
import { analyzeH1, analyzeMetaDescription, analyzeTitle, H1Analysis, TextFieldAnalysis } from './seo.js';
|
|
9
8
|
import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
|
|
10
9
|
import { analyzeLinks, LinkRatioAnalysis } from './links.js';
|
|
11
10
|
import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
|
|
@@ -15,6 +14,8 @@ import { getDb } from '../db/index.js';
|
|
|
15
14
|
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
16
15
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
17
16
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
17
|
+
import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
|
|
18
|
+
import { EngineContext } from '../events.js';
|
|
18
19
|
|
|
19
20
|
export interface CrawlPage {
|
|
20
21
|
url: string;
|
|
@@ -24,12 +25,11 @@ export interface CrawlPage {
|
|
|
24
25
|
canonical?: string;
|
|
25
26
|
noindex?: boolean;
|
|
26
27
|
nofollow?: boolean;
|
|
28
|
+
crawlStatus?: string;
|
|
27
29
|
}
|
|
28
30
|
|
|
29
31
|
export interface AnalyzeOptions {
|
|
30
|
-
fromCrawl?: string;
|
|
31
32
|
live?: boolean;
|
|
32
|
-
html?: boolean;
|
|
33
33
|
seo?: boolean;
|
|
34
34
|
content?: boolean;
|
|
35
35
|
accessibility?: boolean;
|
|
@@ -40,6 +40,7 @@ export interface AnalyzeOptions {
|
|
|
40
40
|
debug?: boolean;
|
|
41
41
|
clusterThreshold?: number;
|
|
42
42
|
minClusterSize?: number;
|
|
43
|
+
allPages?: boolean;
|
|
43
44
|
}
|
|
44
45
|
|
|
45
46
|
export interface PageAnalysis {
|
|
@@ -58,6 +59,7 @@ export interface PageAnalysis {
|
|
|
58
59
|
canonical?: string;
|
|
59
60
|
noindex?: boolean;
|
|
60
61
|
nofollow?: boolean;
|
|
62
|
+
crawlStatus?: string;
|
|
61
63
|
}
|
|
62
64
|
}
|
|
63
65
|
|
|
@@ -77,45 +79,93 @@ export interface AnalysisResult {
|
|
|
77
79
|
accessibility: boolean;
|
|
78
80
|
};
|
|
79
81
|
clusters?: ClusterInfo[];
|
|
82
|
+
snapshotId?: number;
|
|
83
|
+
crawledAt?: string;
|
|
80
84
|
}
|
|
81
85
|
|
|
82
86
|
interface CrawlData {
|
|
83
|
-
pages: CrawlPage[];
|
|
87
|
+
pages: Iterable<CrawlPage> | CrawlPage[];
|
|
84
88
|
metrics: Metrics;
|
|
85
89
|
graph: Graph;
|
|
90
|
+
snapshotId: number;
|
|
91
|
+
crawledAt?: string;
|
|
86
92
|
}
|
|
87
93
|
|
|
88
|
-
|
|
94
|
+
/**
|
|
95
|
+
* Analyzes a site for SEO, content, and accessibility.
|
|
96
|
+
* Supports live crawling or loading from a database snapshot.
|
|
97
|
+
* Note: File-based data loading is not supported.
|
|
98
|
+
*
|
|
99
|
+
* @param url The root URL to analyze
|
|
100
|
+
* @param options Analysis options
|
|
101
|
+
* @param context Engine context for event emission
|
|
102
|
+
*/
|
|
103
|
+
export async function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult> {
|
|
89
104
|
const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
|
|
90
105
|
if (!normalizedRoot) {
|
|
91
106
|
throw new Error('Invalid URL for analysis');
|
|
92
107
|
}
|
|
93
108
|
|
|
94
109
|
let crawlData: CrawlData;
|
|
95
|
-
|
|
110
|
+
let robots: any = null;
|
|
111
|
+
|
|
112
|
+
// Always try to fetch robots.txt for the analysis session
|
|
113
|
+
// to ensure we have the latest rules for visibility reporting.
|
|
114
|
+
try {
|
|
115
|
+
const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
|
|
116
|
+
const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
|
|
117
|
+
const status = robotsRes.status;
|
|
118
|
+
if (typeof status === 'number' && status >= 200 && status < 300) {
|
|
119
|
+
const robotsParserModule = await import('robots-parser');
|
|
120
|
+
const robotsParser = (robotsParserModule as any).default || robotsParserModule;
|
|
121
|
+
robots = (robotsParser as any)(robotsUrl, robotsRes.body);
|
|
122
|
+
}
|
|
123
|
+
} catch {
|
|
124
|
+
// Silence robots fetch errors, fallback to existing or none
|
|
125
|
+
}
|
|
96
126
|
if (options.live) {
|
|
97
|
-
crawlData = await runLiveCrawl(normalizedRoot, options);
|
|
127
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
98
128
|
} else {
|
|
99
129
|
try {
|
|
100
|
-
crawlData = await loadCrawlData(normalizedRoot
|
|
130
|
+
crawlData = await loadCrawlData(normalizedRoot);
|
|
131
|
+
|
|
132
|
+
// Convert generator to array so it can be reused multiple times
|
|
133
|
+
const allPages = Array.from(crawlData.pages);
|
|
134
|
+
crawlData.pages = allPages;
|
|
135
|
+
|
|
136
|
+
// Check if the requested URL actually exists in this snapshot
|
|
137
|
+
const exists = allPages.some(p => p.url === normalizedRoot);
|
|
138
|
+
if (!exists) {
|
|
139
|
+
options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
|
|
140
|
+
if (context) {
|
|
141
|
+
context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
|
|
142
|
+
}
|
|
143
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
144
|
+
}
|
|
101
145
|
} catch (error: any) {
|
|
102
146
|
const isNotFound = error.code === 'ENOENT' ||
|
|
103
147
|
error.message.includes('Crawl data not found') ||
|
|
104
148
|
error.message.includes('No completed snapshot found') ||
|
|
105
149
|
error.message.includes('not found in database');
|
|
106
|
-
if (isNotFound
|
|
107
|
-
|
|
108
|
-
|
|
150
|
+
if (isNotFound) {
|
|
151
|
+
options.live = true; // Force live mode
|
|
152
|
+
if (context) {
|
|
153
|
+
context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
|
|
154
|
+
}
|
|
155
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
109
156
|
} else {
|
|
110
157
|
throw error;
|
|
111
158
|
}
|
|
112
159
|
}
|
|
113
160
|
}
|
|
114
161
|
|
|
162
|
+
const snapshotId = crawlData.snapshotId;
|
|
163
|
+
const crawledAt = crawlData.crawledAt;
|
|
164
|
+
|
|
115
165
|
// Run clustering if requested or as default
|
|
116
166
|
detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
|
|
117
167
|
|
|
118
|
-
const pages = analyzePages(normalizedRoot, crawlData.pages);
|
|
168
|
+
const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
|
|
119
169
|
|
|
120
170
|
const activeModules = {
|
|
121
171
|
seo: !!options.seo,
|
|
@@ -131,15 +181,21 @@ export async function analyzeSite(url: string, options: AnalyzeOptions): Promise
|
|
|
131
181
|
|
|
132
182
|
// Filter to only the requested URL
|
|
133
183
|
const targetPage = filteredPages.find(p => p.url === normalizedRoot);
|
|
134
|
-
|
|
184
|
+
let resultPages: PageAnalysis[];
|
|
185
|
+
|
|
186
|
+
if (options.allPages) {
|
|
187
|
+
resultPages = filteredPages;
|
|
188
|
+
} else {
|
|
189
|
+
resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
|
|
190
|
+
}
|
|
135
191
|
|
|
136
192
|
const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
|
|
137
193
|
const thinPages = pages.filter((page) => page.thinScore >= 70).length;
|
|
138
|
-
const siteScores = aggregateSiteScore(crawlData.metrics, pages);
|
|
194
|
+
const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
|
|
139
195
|
|
|
140
196
|
return {
|
|
141
197
|
site_summary: {
|
|
142
|
-
pages_analyzed:
|
|
198
|
+
pages_analyzed: resultPages.length,
|
|
143
199
|
avg_seo_score: siteScores.seoHealthScore,
|
|
144
200
|
thin_pages: thinPages,
|
|
145
201
|
duplicate_titles: duplicateTitles,
|
|
@@ -148,7 +204,9 @@ export async function analyzeSite(url: string, options: AnalyzeOptions): Promise
|
|
|
148
204
|
site_scores: siteScores,
|
|
149
205
|
pages: resultPages,
|
|
150
206
|
active_modules: activeModules,
|
|
151
|
-
clusters: crawlData.graph.contentClusters
|
|
207
|
+
clusters: crawlData.graph.contentClusters,
|
|
208
|
+
snapshotId,
|
|
209
|
+
crawledAt
|
|
152
210
|
};
|
|
153
211
|
}
|
|
154
212
|
|
|
@@ -157,144 +215,54 @@ export function renderAnalysisHtml(result: AnalysisResult): string {
|
|
|
157
215
|
return renderSinglePageHtml(result.pages[0]);
|
|
158
216
|
}
|
|
159
217
|
const rows = result.pages
|
|
160
|
-
.map((page) => `<
|
|
218
|
+
.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
|
|
161
219
|
.join('');
|
|
162
220
|
|
|
163
|
-
return
|
|
221
|
+
return ANALYSIS_LIST_TEMPLATE
|
|
222
|
+
.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
|
|
223
|
+
.replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
|
|
224
|
+
.replace('{{ROWS}}', rows);
|
|
164
225
|
}
|
|
165
226
|
|
|
166
227
|
function renderSinglePageHtml(page: PageAnalysis): string {
|
|
167
|
-
|
|
168
|
-
<
|
|
169
|
-
|
|
170
|
-
<meta charset="UTF-8">
|
|
171
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
172
|
-
<title>Analysis for ${escapeHtml(page.url)}</title>
|
|
173
|
-
<style>
|
|
174
|
-
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
|
|
175
|
-
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
176
|
-
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
|
|
177
|
-
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
|
|
178
|
-
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
|
|
179
|
-
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
|
|
180
|
-
.status-ok { color: green; font-weight: bold; }
|
|
181
|
-
.status-warning { color: orange; font-weight: bold; }
|
|
182
|
-
.status-critical { color: red; font-weight: bold; }
|
|
183
|
-
.status-missing { color: red; font-weight: bold; }
|
|
184
|
-
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
|
|
185
|
-
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
|
|
186
|
-
.data-table th { width: 150px; color: #666; }
|
|
187
|
-
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
|
|
188
|
-
</style>
|
|
189
|
-
</head>
|
|
190
|
-
<body>
|
|
191
|
-
<h1>Page Analysis</h1>
|
|
192
|
-
<p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
|
|
193
|
-
|
|
194
|
-
<div class="score-card">
|
|
195
|
-
<div class="score-box">
|
|
196
|
-
<div class="score-val">${page.seoScore}</div>
|
|
197
|
-
<div>SEO Score</div>
|
|
198
|
-
</div>
|
|
199
|
-
<div class="score-box">
|
|
200
|
-
<div class="score-val">${page.thinScore}</div>
|
|
201
|
-
<div>Thin Content Score</div>
|
|
202
|
-
</div>
|
|
203
|
-
<div class="score-box">
|
|
204
|
-
<div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
|
|
205
|
-
<div>HTTP Status</div>
|
|
206
|
-
</div>
|
|
207
|
-
</div>
|
|
208
|
-
|
|
209
|
-
<h2>Meta Tags</h2>
|
|
210
|
-
<table class="data-table">
|
|
211
|
-
<tr>
|
|
212
|
-
<th>Title</th>
|
|
213
|
-
<td>
|
|
214
|
-
<div>${escapeHtml(page.title.value || '(missing)')}</div>
|
|
215
|
-
<small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
|
|
216
|
-
</td>
|
|
217
|
-
</tr>
|
|
218
|
-
<tr>
|
|
219
|
-
<th>Description</th>
|
|
220
|
-
<td>
|
|
221
|
-
<div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
|
|
222
|
-
<small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
|
|
223
|
-
</td>
|
|
224
|
-
</tr>
|
|
225
|
-
<tr>
|
|
226
|
-
<th>Canonical</th>
|
|
227
|
-
<td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
|
|
228
|
-
</tr>
|
|
229
|
-
<tr>
|
|
230
|
-
<th>Robots</th>
|
|
231
|
-
<td>
|
|
232
|
-
Index: ${!page.meta.noindex},
|
|
233
|
-
Follow: ${!page.meta.nofollow}
|
|
234
|
-
</td>
|
|
235
|
-
</tr>
|
|
236
|
-
</table>
|
|
237
|
-
|
|
238
|
-
<h2>Content & Heading</h2>
|
|
239
|
-
<table class="data-table">
|
|
240
|
-
<tr>
|
|
241
|
-
<th>H1 Tag</th>
|
|
242
|
-
<td>
|
|
243
|
-
Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
|
|
244
|
-
(${page.h1.count} detected)
|
|
245
|
-
${page.h1.matchesTitle ? ' | Matches Title' : ''}
|
|
246
|
-
</td>
|
|
247
|
-
</tr>
|
|
248
|
-
<tr>
|
|
249
|
-
<th>Word Count</th>
|
|
250
|
-
<td>${page.content.wordCount} words</td>
|
|
251
|
-
</tr>
|
|
252
|
-
<tr>
|
|
253
|
-
<th>Unique Sentences</th>
|
|
254
|
-
<td>${page.content.uniqueSentenceCount}</td>
|
|
255
|
-
</tr>
|
|
256
|
-
<tr>
|
|
257
|
-
<th>Text / HTML Ratio</th>
|
|
258
|
-
<td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
|
|
259
|
-
</tr>
|
|
260
|
-
</table>
|
|
228
|
+
const structuredDataStatus = page.structuredData.present
|
|
229
|
+
? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
|
|
230
|
+
: 'Not detected';
|
|
261
231
|
|
|
262
|
-
|
|
263
|
-
<table class="data-table">
|
|
264
|
-
<tr>
|
|
265
|
-
<th>Internal Links</th>
|
|
266
|
-
<td>${page.links.internalLinks}</td>
|
|
267
|
-
</tr>
|
|
268
|
-
<tr>
|
|
269
|
-
<th>External Links</th>
|
|
270
|
-
<td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
|
|
271
|
-
</tr>
|
|
272
|
-
<tr>
|
|
273
|
-
<th>Images</th>
|
|
274
|
-
<td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
|
|
275
|
-
</tr>
|
|
276
|
-
</table>
|
|
277
|
-
|
|
278
|
-
<h2>Structured Data</h2>
|
|
279
|
-
<table class="data-table">
|
|
280
|
-
<tr>
|
|
281
|
-
<th>Status</th>
|
|
282
|
-
<td>
|
|
283
|
-
${page.structuredData.present
|
|
284
|
-
? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
|
|
285
|
-
: 'Not detected'
|
|
286
|
-
}
|
|
287
|
-
</td>
|
|
288
|
-
</tr>
|
|
289
|
-
${page.structuredData.present ? `
|
|
232
|
+
const structuredDataTypesRow = page.structuredData.present ? `
|
|
290
233
|
<tr>
|
|
291
234
|
<th>Types Found</th>
|
|
292
235
|
<td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
|
|
293
236
|
</tr>
|
|
294
|
-
` : ''
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
237
|
+
` : '';
|
|
238
|
+
|
|
239
|
+
return ANALYSIS_PAGE_TEMPLATE
|
|
240
|
+
.replaceAll('{{URL}}', escapeHtml(page.url))
|
|
241
|
+
.replace('{{SEO_SCORE}}', page.seoScore.toString())
|
|
242
|
+
.replace('{{THIN_SCORE}}', page.thinScore.toString())
|
|
243
|
+
.replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
|
|
244
|
+
.replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
|
|
245
|
+
.replace('{{TITLE_LENGTH}}', page.title.length.toString())
|
|
246
|
+
.replaceAll('{{TITLE_STATUS}}', page.title.status)
|
|
247
|
+
.replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
|
|
248
|
+
.replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
|
|
249
|
+
.replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
|
|
250
|
+
.replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
|
|
251
|
+
.replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
|
|
252
|
+
.replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
|
|
253
|
+
.replaceAll('{{H1_STATUS}}', page.h1.status)
|
|
254
|
+
.replace('{{H1_COUNT}}', page.h1.count.toString())
|
|
255
|
+
.replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
|
|
256
|
+
.replace('{{WORD_COUNT}}', page.content.wordCount.toString())
|
|
257
|
+
.replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
|
|
258
|
+
.replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
|
|
259
|
+
.replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
|
|
260
|
+
.replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
|
|
261
|
+
.replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
|
|
262
|
+
.replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
|
|
263
|
+
.replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
|
|
264
|
+
.replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
|
|
265
|
+
.replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
|
|
298
266
|
}
|
|
299
267
|
|
|
300
268
|
export function renderAnalysisMarkdown(result: AnalysisResult): string {
|
|
@@ -347,52 +315,95 @@ function escapeHtml(value: string): string {
|
|
|
347
315
|
return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
|
|
348
316
|
}
|
|
349
317
|
|
|
350
|
-
function analyzePages(rootUrl: string, pages: CrawlPage[]): PageAnalysis[] {
|
|
351
|
-
const
|
|
352
|
-
const
|
|
353
|
-
const titles = applyDuplicateStatuses(titleCandidates);
|
|
354
|
-
const metas = applyDuplicateStatuses(metaCandidates);
|
|
355
|
-
|
|
318
|
+
export function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[] {
|
|
319
|
+
const titleCounts = new Map<string, number>();
|
|
320
|
+
const metaCounts = new Map<string, number>();
|
|
356
321
|
const sentenceCountFrequency = new Map<number, number>();
|
|
357
|
-
const baseContent = pages.map((page) => analyzeContent(page.html || ''));
|
|
358
|
-
for (const item of baseContent) {
|
|
359
|
-
sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
|
|
360
|
-
}
|
|
361
322
|
|
|
362
|
-
|
|
323
|
+
const results: PageAnalysis[] = [];
|
|
324
|
+
|
|
325
|
+
for (const page of pages) {
|
|
363
326
|
const html = page.html || '';
|
|
364
|
-
|
|
365
|
-
|
|
327
|
+
|
|
328
|
+
// 0. Update crawl status based on current robots rules
|
|
329
|
+
let crawlStatus = page.crawlStatus;
|
|
330
|
+
if (robots) {
|
|
331
|
+
const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
|
|
332
|
+
(!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
|
|
333
|
+
if (isBlocked) {
|
|
334
|
+
crawlStatus = 'blocked_by_robots';
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// 1. Analyze Individual Components
|
|
339
|
+
const title = analyzeTitle(html);
|
|
340
|
+
const metaDescription = analyzeMetaDescription(html);
|
|
366
341
|
const h1 = analyzeH1(html, title.value);
|
|
367
|
-
const content =
|
|
368
|
-
const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
|
|
369
|
-
const thinScore = calculateThinContentScore(content, duplicationScore);
|
|
342
|
+
const content = analyzeContent(html);
|
|
370
343
|
const images = analyzeImageAlts(html);
|
|
371
344
|
const links = analyzeLinks(html, page.url, rootUrl);
|
|
372
345
|
const structuredData = analyzeStructuredData(html);
|
|
373
346
|
|
|
374
|
-
|
|
347
|
+
// 2. Accumulate Frequencies for Duplicates
|
|
348
|
+
if (title.value) {
|
|
349
|
+
const key = (title.value || '').trim().toLowerCase();
|
|
350
|
+
titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
|
|
351
|
+
}
|
|
352
|
+
if (metaDescription.value) {
|
|
353
|
+
const key = (metaDescription.value || '').trim().toLowerCase();
|
|
354
|
+
metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
|
|
355
|
+
}
|
|
356
|
+
sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
|
|
357
|
+
|
|
358
|
+
// 3. Store Preliminary Result
|
|
359
|
+
results.push({
|
|
375
360
|
url: page.url,
|
|
376
361
|
status: page.status || 0,
|
|
377
362
|
title,
|
|
378
363
|
metaDescription,
|
|
379
364
|
h1,
|
|
380
365
|
content,
|
|
381
|
-
thinScore,
|
|
366
|
+
thinScore: 0, // Calculated in pass 2
|
|
382
367
|
images,
|
|
383
368
|
links,
|
|
384
369
|
structuredData,
|
|
385
|
-
seoScore: 0,
|
|
370
|
+
seoScore: 0, // Calculated in pass 2
|
|
386
371
|
meta: {
|
|
387
372
|
canonical: page.canonical,
|
|
388
373
|
noindex: page.noindex,
|
|
389
|
-
nofollow: page.nofollow
|
|
374
|
+
nofollow: page.nofollow,
|
|
375
|
+
crawlStatus
|
|
376
|
+
}
|
|
377
|
+
});
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// 4. Finalize Statuses and Scores (Pass 2)
|
|
381
|
+
for (const analysis of results) {
|
|
382
|
+
// Check Title Duplicates
|
|
383
|
+
if (analysis.title.value) {
|
|
384
|
+
const key = (analysis.title.value || '').trim().toLowerCase();
|
|
385
|
+
if ((titleCounts.get(key) || 0) > 1) {
|
|
386
|
+
analysis.title.status = 'duplicate';
|
|
390
387
|
}
|
|
391
|
-
}
|
|
388
|
+
}
|
|
392
389
|
|
|
390
|
+
// Check Meta Duplicates
|
|
391
|
+
if (analysis.metaDescription.value) {
|
|
392
|
+
const key = (analysis.metaDescription.value || '').trim().toLowerCase();
|
|
393
|
+
if ((metaCounts.get(key) || 0) > 1) {
|
|
394
|
+
analysis.metaDescription.status = 'duplicate';
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Check Content Duplication
|
|
399
|
+
const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
|
|
400
|
+
analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
|
|
401
|
+
|
|
402
|
+
// Calculate Final SEO Score
|
|
393
403
|
analysis.seoScore = scorePageSeo(analysis);
|
|
394
|
-
|
|
395
|
-
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
return results;
|
|
396
407
|
}
|
|
397
408
|
|
|
398
409
|
function filterPageModules(
|
|
@@ -416,23 +427,7 @@ function filterPageModules(
|
|
|
416
427
|
};
|
|
417
428
|
}
|
|
418
429
|
|
|
419
|
-
async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<CrawlData> {
|
|
420
|
-
// If fromCrawl is provided, we could theoretically load JSON, but
|
|
421
|
-
// we now default to DB fetching for all operations.
|
|
422
|
-
|
|
423
|
-
if (fromCrawl) {
|
|
424
|
-
try {
|
|
425
|
-
const content = await fs.readFile(fromCrawl, 'utf-8');
|
|
426
|
-
const raw = JSON.parse(content) as Record<string, unknown>;
|
|
427
|
-
const pages = parsePages(raw);
|
|
428
|
-
const graph = graphFromPages(rootUrl, pages, raw);
|
|
429
|
-
const metrics = calculateMetrics(graph, 5);
|
|
430
|
-
return { pages, metrics, graph };
|
|
431
|
-
} catch (_e) {
|
|
432
|
-
// Fallback downwards if file doesn't exist
|
|
433
|
-
}
|
|
434
|
-
}
|
|
435
|
-
|
|
430
|
+
async function loadCrawlData(rootUrl: string): Promise<CrawlData> {
|
|
436
431
|
const db = getDb();
|
|
437
432
|
const siteRepo = new SiteRepository(db);
|
|
438
433
|
const snapshotRepo = new SnapshotRepository(db);
|
|
@@ -442,107 +437,69 @@ async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<Crawl
|
|
|
442
437
|
const domain = urlObj.hostname.replace('www.', '');
|
|
443
438
|
const site = siteRepo.firstOrCreateSite(domain);
|
|
444
439
|
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
440
|
+
let snapshot;
|
|
441
|
+
const page = pageRepo.getPage(site.id, rootUrl);
|
|
442
|
+
if (page && page.last_seen_snapshot_id) {
|
|
443
|
+
snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
|
|
448
444
|
}
|
|
449
445
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
// We also need the `pages` array for analysis.
|
|
454
|
-
// It needs `html` which might not be fully available unless we look up from the DB or Graph.
|
|
455
|
-
// Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
|
|
456
|
-
// Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
|
|
457
|
-
// We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
|
|
458
|
-
|
|
459
|
-
const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
|
|
460
|
-
const pages: CrawlPage[] = dbPages.map((p: any) => ({
|
|
461
|
-
url: p.normalized_url,
|
|
462
|
-
status: p.http_status || 0,
|
|
463
|
-
html: p.html || '',
|
|
464
|
-
depth: p.depth || 0
|
|
465
|
-
}));
|
|
466
|
-
|
|
467
|
-
return { pages, metrics, graph };
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
function parsePages(raw: Record<string, unknown>): CrawlPage[] {
|
|
471
|
-
if (Array.isArray(raw.pages)) {
|
|
472
|
-
return raw.pages.map((page) => {
|
|
473
|
-
const p = page as Record<string, unknown>;
|
|
474
|
-
return {
|
|
475
|
-
url: String(p.url || ''),
|
|
476
|
-
status: Number(p.status || 0),
|
|
477
|
-
html: typeof p.html === 'string' ? p.html : '',
|
|
478
|
-
depth: Number(p.depth || 0)
|
|
479
|
-
};
|
|
480
|
-
}).filter((page) => Boolean(page.url));
|
|
446
|
+
if (!snapshot) {
|
|
447
|
+
snapshot = snapshotRepo.getLatestSnapshot(site.id);
|
|
481
448
|
}
|
|
482
449
|
|
|
483
|
-
if (
|
|
484
|
-
|
|
485
|
-
const n = node as Record<string, unknown>;
|
|
486
|
-
return {
|
|
487
|
-
url: String(n.url || ''),
|
|
488
|
-
status: Number(n.status || 0),
|
|
489
|
-
html: typeof n.html === 'string' ? n.html : '',
|
|
490
|
-
depth: Number(n.depth || 0)
|
|
491
|
-
};
|
|
492
|
-
}).filter((page) => Boolean(page.url));
|
|
450
|
+
if (!snapshot) {
|
|
451
|
+
throw new Error(`No crawl data found for ${rootUrl} in database.`);
|
|
493
452
|
}
|
|
494
453
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
function graphFromPages(rootUrl: string, pages: CrawlPage[], raw: Record<string, unknown>): Graph {
|
|
499
|
-
const graph = new Graph();
|
|
500
|
-
|
|
501
|
-
for (const page of pages) {
|
|
502
|
-
graph.addNode(page.url, page.depth || 0, page.status || 0);
|
|
503
|
-
}
|
|
454
|
+
const graph = loadGraphFromSnapshot(snapshot.id);
|
|
455
|
+
const metrics = calculateMetrics(graph, 5);
|
|
504
456
|
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
457
|
+
// Use iterator to save memory
|
|
458
|
+
const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
|
|
459
|
+
|
|
460
|
+
// We need to map the DB pages to CrawlPage format lazily
|
|
461
|
+
const pagesGenerator = function* () {
|
|
462
|
+
for (const p of dbPagesIterator) {
|
|
463
|
+
yield {
|
|
464
|
+
url: p.normalized_url,
|
|
465
|
+
status: p.http_status || 0,
|
|
466
|
+
html: p.html || '',
|
|
467
|
+
depth: p.depth || 0,
|
|
468
|
+
canonical: p.canonical_url || undefined,
|
|
469
|
+
noindex: !!p.noindex,
|
|
470
|
+
nofollow: !!p.nofollow,
|
|
471
|
+
crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
|
|
472
|
+
} as CrawlPage;
|
|
513
473
|
}
|
|
514
|
-
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
for (const page of pages) {
|
|
518
|
-
if (!page.html) continue;
|
|
519
|
-
const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
|
|
520
|
-
if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0) continue;
|
|
521
|
-
}
|
|
474
|
+
};
|
|
522
475
|
|
|
523
|
-
return graph;
|
|
476
|
+
return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
|
|
524
477
|
}
|
|
525
478
|
|
|
526
|
-
|
|
479
|
+
|
|
480
|
+
async function runLiveCrawl(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<CrawlData> {
|
|
527
481
|
const snapshotId = await crawl(url, {
|
|
528
|
-
limit: 1,
|
|
482
|
+
limit: 1, // Always limit to 1 for single page live analysis
|
|
529
483
|
depth: 0,
|
|
530
484
|
rate: options.rate,
|
|
531
485
|
proxyUrl: options.proxyUrl,
|
|
532
486
|
userAgent: options.userAgent,
|
|
533
487
|
maxRedirects: options.maxRedirects,
|
|
534
|
-
debug: options.debug
|
|
535
|
-
|
|
488
|
+
debug: options.debug,
|
|
489
|
+
snapshotType: 'partial'
|
|
490
|
+
}, context) as number;
|
|
536
491
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
537
492
|
const pages = graph.getNodes().map((node) => ({
|
|
538
493
|
url: node.url,
|
|
539
494
|
status: node.status,
|
|
540
495
|
html: node.html || '', // Include HTML
|
|
541
|
-
depth: node.depth
|
|
496
|
+
depth: node.depth,
|
|
497
|
+
crawlStatus: node.crawlStatus
|
|
542
498
|
}));
|
|
543
499
|
return {
|
|
544
500
|
pages,
|
|
545
501
|
metrics: calculateMetrics(graph, 1),
|
|
546
|
-
graph
|
|
502
|
+
graph,
|
|
503
|
+
snapshotId
|
|
547
504
|
};
|
|
548
505
|
}
|