@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/dist/analysis/analyze.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
1
|
import { crawl } from '../crawler/crawl.js';
|
|
3
2
|
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
4
3
|
import { normalizeUrl } from '../crawler/normalize.js';
|
|
5
4
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
6
|
-
import { Graph } from '../graph/graph.js';
|
|
7
5
|
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
8
|
-
import { analyzeH1, analyzeMetaDescription, analyzeTitle
|
|
6
|
+
import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
|
|
9
7
|
import { analyzeImageAlts } from './images.js';
|
|
10
8
|
import { analyzeLinks } from './links.js';
|
|
11
9
|
import { analyzeStructuredData } from './structuredData.js';
|
|
@@ -15,36 +13,79 @@ import { getDb } from '../db/index.js';
|
|
|
15
13
|
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
16
14
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
17
15
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
18
|
-
|
|
16
|
+
import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
|
|
17
|
+
/**
|
|
18
|
+
* Analyzes a site for SEO, content, and accessibility.
|
|
19
|
+
* Supports live crawling or loading from a database snapshot.
|
|
20
|
+
* Note: File-based data loading is not supported.
|
|
21
|
+
*
|
|
22
|
+
* @param url The root URL to analyze
|
|
23
|
+
* @param options Analysis options
|
|
24
|
+
* @param context Engine context for event emission
|
|
25
|
+
*/
|
|
26
|
+
export async function analyzeSite(url, options, context) {
|
|
19
27
|
const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
|
|
20
28
|
if (!normalizedRoot) {
|
|
21
29
|
throw new Error('Invalid URL for analysis');
|
|
22
30
|
}
|
|
23
31
|
let crawlData;
|
|
32
|
+
let robots = null;
|
|
33
|
+
// Always try to fetch robots.txt for the analysis session
|
|
34
|
+
// to ensure we have the latest rules for visibility reporting.
|
|
35
|
+
try {
|
|
36
|
+
const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
|
|
37
|
+
const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
|
|
38
|
+
const status = robotsRes.status;
|
|
39
|
+
if (typeof status === 'number' && status >= 200 && status < 300) {
|
|
40
|
+
const robotsParserModule = await import('robots-parser');
|
|
41
|
+
const robotsParser = robotsParserModule.default || robotsParserModule;
|
|
42
|
+
robots = robotsParser(robotsUrl, robotsRes.body);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
catch {
|
|
46
|
+
// Silence robots fetch errors, fallback to existing or none
|
|
47
|
+
}
|
|
24
48
|
if (options.live) {
|
|
25
|
-
crawlData = await runLiveCrawl(normalizedRoot, options);
|
|
49
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
26
50
|
}
|
|
27
51
|
else {
|
|
28
52
|
try {
|
|
29
|
-
crawlData = await loadCrawlData(normalizedRoot
|
|
53
|
+
crawlData = await loadCrawlData(normalizedRoot);
|
|
54
|
+
// Convert generator to array so it can be reused multiple times
|
|
55
|
+
const allPages = Array.from(crawlData.pages);
|
|
56
|
+
crawlData.pages = allPages;
|
|
57
|
+
// Check if the requested URL actually exists in this snapshot
|
|
58
|
+
const exists = allPages.some(p => p.url === normalizedRoot);
|
|
59
|
+
if (!exists) {
|
|
60
|
+
options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
|
|
61
|
+
if (context) {
|
|
62
|
+
context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
|
|
63
|
+
}
|
|
64
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
65
|
+
}
|
|
30
66
|
}
|
|
31
67
|
catch (error) {
|
|
32
68
|
const isNotFound = error.code === 'ENOENT' ||
|
|
33
69
|
error.message.includes('Crawl data not found') ||
|
|
34
70
|
error.message.includes('No completed snapshot found') ||
|
|
35
71
|
error.message.includes('not found in database');
|
|
36
|
-
if (isNotFound
|
|
37
|
-
|
|
38
|
-
|
|
72
|
+
if (isNotFound) {
|
|
73
|
+
options.live = true; // Force live mode
|
|
74
|
+
if (context) {
|
|
75
|
+
context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
|
|
76
|
+
}
|
|
77
|
+
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
39
78
|
}
|
|
40
79
|
else {
|
|
41
80
|
throw error;
|
|
42
81
|
}
|
|
43
82
|
}
|
|
44
83
|
}
|
|
84
|
+
const snapshotId = crawlData.snapshotId;
|
|
85
|
+
const crawledAt = crawlData.crawledAt;
|
|
45
86
|
// Run clustering if requested or as default
|
|
46
87
|
detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
|
|
47
|
-
const pages = analyzePages(normalizedRoot, crawlData.pages);
|
|
88
|
+
const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
|
|
48
89
|
const activeModules = {
|
|
49
90
|
seo: !!options.seo,
|
|
50
91
|
content: !!options.content,
|
|
@@ -56,13 +97,19 @@ export async function analyzeSite(url, options) {
|
|
|
56
97
|
: pages;
|
|
57
98
|
// Filter to only the requested URL
|
|
58
99
|
const targetPage = filteredPages.find(p => p.url === normalizedRoot);
|
|
59
|
-
|
|
100
|
+
let resultPages;
|
|
101
|
+
if (options.allPages) {
|
|
102
|
+
resultPages = filteredPages;
|
|
103
|
+
}
|
|
104
|
+
else {
|
|
105
|
+
resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
|
|
106
|
+
}
|
|
60
107
|
const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
|
|
61
108
|
const thinPages = pages.filter((page) => page.thinScore >= 70).length;
|
|
62
|
-
const siteScores = aggregateSiteScore(crawlData.metrics, pages);
|
|
109
|
+
const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
|
|
63
110
|
return {
|
|
64
111
|
site_summary: {
|
|
65
|
-
pages_analyzed:
|
|
112
|
+
pages_analyzed: resultPages.length,
|
|
66
113
|
avg_seo_score: siteScores.seoHealthScore,
|
|
67
114
|
thin_pages: thinPages,
|
|
68
115
|
duplicate_titles: duplicateTitles,
|
|
@@ -71,7 +118,9 @@ export async function analyzeSite(url, options) {
|
|
|
71
118
|
site_scores: siteScores,
|
|
72
119
|
pages: resultPages,
|
|
73
120
|
active_modules: activeModules,
|
|
74
|
-
clusters: crawlData.graph.contentClusters
|
|
121
|
+
clusters: crawlData.graph.contentClusters,
|
|
122
|
+
snapshotId,
|
|
123
|
+
crawledAt
|
|
75
124
|
};
|
|
76
125
|
}
|
|
77
126
|
export function renderAnalysisHtml(result) {
|
|
@@ -79,141 +128,50 @@ export function renderAnalysisHtml(result) {
|
|
|
79
128
|
return renderSinglePageHtml(result.pages[0]);
|
|
80
129
|
}
|
|
81
130
|
const rows = result.pages
|
|
82
|
-
.map((page) => `<
|
|
131
|
+
.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
|
|
83
132
|
.join('');
|
|
84
|
-
return
|
|
133
|
+
return ANALYSIS_LIST_TEMPLATE
|
|
134
|
+
.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
|
|
135
|
+
.replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
|
|
136
|
+
.replace('{{ROWS}}', rows);
|
|
85
137
|
}
|
|
86
138
|
function renderSinglePageHtml(page) {
|
|
87
|
-
|
|
88
|
-
<html lang="en">
|
|
89
|
-
<head>
|
|
90
|
-
<meta charset="UTF-8">
|
|
91
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
92
|
-
<title>Analysis for ${escapeHtml(page.url)}</title>
|
|
93
|
-
<style>
|
|
94
|
-
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
|
|
95
|
-
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
96
|
-
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
|
|
97
|
-
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
|
|
98
|
-
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
|
|
99
|
-
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
|
|
100
|
-
.status-ok { color: green; font-weight: bold; }
|
|
101
|
-
.status-warning { color: orange; font-weight: bold; }
|
|
102
|
-
.status-critical { color: red; font-weight: bold; }
|
|
103
|
-
.status-missing { color: red; font-weight: bold; }
|
|
104
|
-
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
|
|
105
|
-
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
|
|
106
|
-
.data-table th { width: 150px; color: #666; }
|
|
107
|
-
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
|
|
108
|
-
</style>
|
|
109
|
-
</head>
|
|
110
|
-
<body>
|
|
111
|
-
<h1>Page Analysis</h1>
|
|
112
|
-
<p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
|
|
113
|
-
|
|
114
|
-
<div class="score-card">
|
|
115
|
-
<div class="score-box">
|
|
116
|
-
<div class="score-val">${page.seoScore}</div>
|
|
117
|
-
<div>SEO Score</div>
|
|
118
|
-
</div>
|
|
119
|
-
<div class="score-box">
|
|
120
|
-
<div class="score-val">${page.thinScore}</div>
|
|
121
|
-
<div>Thin Content Score</div>
|
|
122
|
-
</div>
|
|
123
|
-
<div class="score-box">
|
|
124
|
-
<div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
|
|
125
|
-
<div>HTTP Status</div>
|
|
126
|
-
</div>
|
|
127
|
-
</div>
|
|
128
|
-
|
|
129
|
-
<h2>Meta Tags</h2>
|
|
130
|
-
<table class="data-table">
|
|
131
|
-
<tr>
|
|
132
|
-
<th>Title</th>
|
|
133
|
-
<td>
|
|
134
|
-
<div>${escapeHtml(page.title.value || '(missing)')}</div>
|
|
135
|
-
<small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
|
|
136
|
-
</td>
|
|
137
|
-
</tr>
|
|
138
|
-
<tr>
|
|
139
|
-
<th>Description</th>
|
|
140
|
-
<td>
|
|
141
|
-
<div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
|
|
142
|
-
<small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
|
|
143
|
-
</td>
|
|
144
|
-
</tr>
|
|
145
|
-
<tr>
|
|
146
|
-
<th>Canonical</th>
|
|
147
|
-
<td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
|
|
148
|
-
</tr>
|
|
149
|
-
<tr>
|
|
150
|
-
<th>Robots</th>
|
|
151
|
-
<td>
|
|
152
|
-
Index: ${!page.meta.noindex},
|
|
153
|
-
Follow: ${!page.meta.nofollow}
|
|
154
|
-
</td>
|
|
155
|
-
</tr>
|
|
156
|
-
</table>
|
|
157
|
-
|
|
158
|
-
<h2>Content & Heading</h2>
|
|
159
|
-
<table class="data-table">
|
|
160
|
-
<tr>
|
|
161
|
-
<th>H1 Tag</th>
|
|
162
|
-
<td>
|
|
163
|
-
Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
|
|
164
|
-
(${page.h1.count} detected)
|
|
165
|
-
${page.h1.matchesTitle ? ' | Matches Title' : ''}
|
|
166
|
-
</td>
|
|
167
|
-
</tr>
|
|
168
|
-
<tr>
|
|
169
|
-
<th>Word Count</th>
|
|
170
|
-
<td>${page.content.wordCount} words</td>
|
|
171
|
-
</tr>
|
|
172
|
-
<tr>
|
|
173
|
-
<th>Unique Sentences</th>
|
|
174
|
-
<td>${page.content.uniqueSentenceCount}</td>
|
|
175
|
-
</tr>
|
|
176
|
-
<tr>
|
|
177
|
-
<th>Text / HTML Ratio</th>
|
|
178
|
-
<td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
|
|
179
|
-
</tr>
|
|
180
|
-
</table>
|
|
181
|
-
|
|
182
|
-
<h2>Links & Images</h2>
|
|
183
|
-
<table class="data-table">
|
|
184
|
-
<tr>
|
|
185
|
-
<th>Internal Links</th>
|
|
186
|
-
<td>${page.links.internalLinks}</td>
|
|
187
|
-
</tr>
|
|
188
|
-
<tr>
|
|
189
|
-
<th>External Links</th>
|
|
190
|
-
<td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
|
|
191
|
-
</tr>
|
|
192
|
-
<tr>
|
|
193
|
-
<th>Images</th>
|
|
194
|
-
<td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
|
|
195
|
-
</tr>
|
|
196
|
-
</table>
|
|
197
|
-
|
|
198
|
-
<h2>Structured Data</h2>
|
|
199
|
-
<table class="data-table">
|
|
200
|
-
<tr>
|
|
201
|
-
<th>Status</th>
|
|
202
|
-
<td>
|
|
203
|
-
${page.structuredData.present
|
|
139
|
+
const structuredDataStatus = page.structuredData.present
|
|
204
140
|
? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
|
|
205
|
-
: 'Not detected'
|
|
206
|
-
|
|
207
|
-
</tr>
|
|
208
|
-
${page.structuredData.present ? `
|
|
141
|
+
: 'Not detected';
|
|
142
|
+
const structuredDataTypesRow = page.structuredData.present ? `
|
|
209
143
|
<tr>
|
|
210
144
|
<th>Types Found</th>
|
|
211
145
|
<td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
|
|
212
146
|
</tr>
|
|
213
|
-
` : ''
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
147
|
+
` : '';
|
|
148
|
+
return ANALYSIS_PAGE_TEMPLATE
|
|
149
|
+
.replaceAll('{{URL}}', escapeHtml(page.url))
|
|
150
|
+
.replace('{{SEO_SCORE}}', page.seoScore.toString())
|
|
151
|
+
.replace('{{THIN_SCORE}}', page.thinScore.toString())
|
|
152
|
+
.replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
|
|
153
|
+
.replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
|
|
154
|
+
.replace('{{TITLE_LENGTH}}', page.title.length.toString())
|
|
155
|
+
.replaceAll('{{TITLE_STATUS}}', page.title.status)
|
|
156
|
+
.replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
|
|
157
|
+
.replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
|
|
158
|
+
.replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
|
|
159
|
+
.replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
|
|
160
|
+
.replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
|
|
161
|
+
.replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
|
|
162
|
+
.replaceAll('{{H1_STATUS}}', page.h1.status)
|
|
163
|
+
.replace('{{H1_COUNT}}', page.h1.count.toString())
|
|
164
|
+
.replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
|
|
165
|
+
.replace('{{WORD_COUNT}}', page.content.wordCount.toString())
|
|
166
|
+
.replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
|
|
167
|
+
.replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
|
|
168
|
+
.replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
|
|
169
|
+
.replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
|
|
170
|
+
.replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
|
|
171
|
+
.replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
|
|
172
|
+
.replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
|
|
173
|
+
.replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
|
|
174
|
+
.replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
|
|
217
175
|
}
|
|
218
176
|
export function renderAnalysisMarkdown(result) {
|
|
219
177
|
const summary = [
|
|
@@ -259,48 +217,84 @@ export function renderAnalysisCsv(result) {
|
|
|
259
217
|
function escapeHtml(value) {
|
|
260
218
|
return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
|
|
261
219
|
}
|
|
262
|
-
function analyzePages(rootUrl, pages) {
|
|
263
|
-
const
|
|
264
|
-
const
|
|
265
|
-
const titles = applyDuplicateStatuses(titleCandidates);
|
|
266
|
-
const metas = applyDuplicateStatuses(metaCandidates);
|
|
220
|
+
export function analyzePages(rootUrl, pages, robots) {
|
|
221
|
+
const titleCounts = new Map();
|
|
222
|
+
const metaCounts = new Map();
|
|
267
223
|
const sentenceCountFrequency = new Map();
|
|
268
|
-
const
|
|
269
|
-
for (const
|
|
270
|
-
sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
|
|
271
|
-
}
|
|
272
|
-
return pages.map((page, index) => {
|
|
224
|
+
const results = [];
|
|
225
|
+
for (const page of pages) {
|
|
273
226
|
const html = page.html || '';
|
|
274
|
-
|
|
275
|
-
|
|
227
|
+
// 0. Update crawl status based on current robots rules
|
|
228
|
+
let crawlStatus = page.crawlStatus;
|
|
229
|
+
if (robots) {
|
|
230
|
+
const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
|
|
231
|
+
(!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
|
|
232
|
+
if (isBlocked) {
|
|
233
|
+
crawlStatus = 'blocked_by_robots';
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
// 1. Analyze Individual Components
|
|
237
|
+
const title = analyzeTitle(html);
|
|
238
|
+
const metaDescription = analyzeMetaDescription(html);
|
|
276
239
|
const h1 = analyzeH1(html, title.value);
|
|
277
|
-
const content =
|
|
278
|
-
const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
|
|
279
|
-
const thinScore = calculateThinContentScore(content, duplicationScore);
|
|
240
|
+
const content = analyzeContent(html);
|
|
280
241
|
const images = analyzeImageAlts(html);
|
|
281
242
|
const links = analyzeLinks(html, page.url, rootUrl);
|
|
282
243
|
const structuredData = analyzeStructuredData(html);
|
|
283
|
-
|
|
244
|
+
// 2. Accumulate Frequencies for Duplicates
|
|
245
|
+
if (title.value) {
|
|
246
|
+
const key = (title.value || '').trim().toLowerCase();
|
|
247
|
+
titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
|
|
248
|
+
}
|
|
249
|
+
if (metaDescription.value) {
|
|
250
|
+
const key = (metaDescription.value || '').trim().toLowerCase();
|
|
251
|
+
metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
|
|
252
|
+
}
|
|
253
|
+
sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
|
|
254
|
+
// 3. Store Preliminary Result
|
|
255
|
+
results.push({
|
|
284
256
|
url: page.url,
|
|
285
257
|
status: page.status || 0,
|
|
286
258
|
title,
|
|
287
259
|
metaDescription,
|
|
288
260
|
h1,
|
|
289
261
|
content,
|
|
290
|
-
thinScore,
|
|
262
|
+
thinScore: 0, // Calculated in pass 2
|
|
291
263
|
images,
|
|
292
264
|
links,
|
|
293
265
|
structuredData,
|
|
294
|
-
seoScore: 0,
|
|
266
|
+
seoScore: 0, // Calculated in pass 2
|
|
295
267
|
meta: {
|
|
296
268
|
canonical: page.canonical,
|
|
297
269
|
noindex: page.noindex,
|
|
298
|
-
nofollow: page.nofollow
|
|
270
|
+
nofollow: page.nofollow,
|
|
271
|
+
crawlStatus
|
|
272
|
+
}
|
|
273
|
+
});
|
|
274
|
+
}
|
|
275
|
+
// 4. Finalize Statuses and Scores (Pass 2)
|
|
276
|
+
for (const analysis of results) {
|
|
277
|
+
// Check Title Duplicates
|
|
278
|
+
if (analysis.title.value) {
|
|
279
|
+
const key = (analysis.title.value || '').trim().toLowerCase();
|
|
280
|
+
if ((titleCounts.get(key) || 0) > 1) {
|
|
281
|
+
analysis.title.status = 'duplicate';
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
// Check Meta Duplicates
|
|
285
|
+
if (analysis.metaDescription.value) {
|
|
286
|
+
const key = (analysis.metaDescription.value || '').trim().toLowerCase();
|
|
287
|
+
if ((metaCounts.get(key) || 0) > 1) {
|
|
288
|
+
analysis.metaDescription.status = 'duplicate';
|
|
299
289
|
}
|
|
300
|
-
}
|
|
290
|
+
}
|
|
291
|
+
// Check Content Duplication
|
|
292
|
+
const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
|
|
293
|
+
analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
|
|
294
|
+
// Calculate Final SEO Score
|
|
301
295
|
analysis.seoScore = scorePageSeo(analysis);
|
|
302
|
-
|
|
303
|
-
|
|
296
|
+
}
|
|
297
|
+
return results;
|
|
304
298
|
}
|
|
305
299
|
function filterPageModules(page, modules) {
|
|
306
300
|
const keepSeo = modules.seo;
|
|
@@ -318,22 +312,7 @@ function filterPageModules(page, modules) {
|
|
|
318
312
|
images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
|
|
319
313
|
};
|
|
320
314
|
}
|
|
321
|
-
async function loadCrawlData(rootUrl
|
|
322
|
-
// If fromCrawl is provided, we could theoretically load JSON, but
|
|
323
|
-
// we now default to DB fetching for all operations.
|
|
324
|
-
if (fromCrawl) {
|
|
325
|
-
try {
|
|
326
|
-
const content = await fs.readFile(fromCrawl, 'utf-8');
|
|
327
|
-
const raw = JSON.parse(content);
|
|
328
|
-
const pages = parsePages(raw);
|
|
329
|
-
const graph = graphFromPages(rootUrl, pages, raw);
|
|
330
|
-
const metrics = calculateMetrics(graph, 5);
|
|
331
|
-
return { pages, metrics, graph };
|
|
332
|
-
}
|
|
333
|
-
catch (_e) {
|
|
334
|
-
// Fallback downwards if file doesn't exist
|
|
335
|
-
}
|
|
336
|
-
}
|
|
315
|
+
async function loadCrawlData(rootUrl) {
|
|
337
316
|
const db = getDb();
|
|
338
317
|
const siteRepo = new SiteRepository(db);
|
|
339
318
|
const snapshotRepo = new SnapshotRepository(db);
|
|
@@ -341,96 +320,61 @@ async function loadCrawlData(rootUrl, fromCrawl) {
|
|
|
341
320
|
const urlObj = new URL(rootUrl);
|
|
342
321
|
const domain = urlObj.hostname.replace('www.', '');
|
|
343
322
|
const site = siteRepo.firstOrCreateSite(domain);
|
|
344
|
-
|
|
323
|
+
let snapshot;
|
|
324
|
+
const page = pageRepo.getPage(site.id, rootUrl);
|
|
325
|
+
if (page && page.last_seen_snapshot_id) {
|
|
326
|
+
snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
|
|
327
|
+
}
|
|
345
328
|
if (!snapshot) {
|
|
346
|
-
|
|
329
|
+
snapshot = snapshotRepo.getLatestSnapshot(site.id);
|
|
330
|
+
}
|
|
331
|
+
if (!snapshot) {
|
|
332
|
+
throw new Error(`No crawl data found for ${rootUrl} in database.`);
|
|
347
333
|
}
|
|
348
334
|
const graph = loadGraphFromSnapshot(snapshot.id);
|
|
349
335
|
const metrics = calculateMetrics(graph, 5);
|
|
350
|
-
//
|
|
351
|
-
|
|
352
|
-
//
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
function parsePages(raw) {
|
|
365
|
-
if (Array.isArray(raw.pages)) {
|
|
366
|
-
return raw.pages.map((page) => {
|
|
367
|
-
const p = page;
|
|
368
|
-
return {
|
|
369
|
-
url: String(p.url || ''),
|
|
370
|
-
status: Number(p.status || 0),
|
|
371
|
-
html: typeof p.html === 'string' ? p.html : '',
|
|
372
|
-
depth: Number(p.depth || 0)
|
|
336
|
+
// Use iterator to save memory
|
|
337
|
+
const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
|
|
338
|
+
// We need to map the DB pages to CrawlPage format lazily
|
|
339
|
+
const pagesGenerator = function* () {
|
|
340
|
+
for (const p of dbPagesIterator) {
|
|
341
|
+
yield {
|
|
342
|
+
url: p.normalized_url,
|
|
343
|
+
status: p.http_status || 0,
|
|
344
|
+
html: p.html || '',
|
|
345
|
+
depth: p.depth || 0,
|
|
346
|
+
canonical: p.canonical_url || undefined,
|
|
347
|
+
noindex: !!p.noindex,
|
|
348
|
+
nofollow: !!p.nofollow,
|
|
349
|
+
crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
|
|
373
350
|
};
|
|
374
|
-
}).filter((page) => Boolean(page.url));
|
|
375
|
-
}
|
|
376
|
-
if (Array.isArray(raw.nodes)) {
|
|
377
|
-
return raw.nodes.map((node) => {
|
|
378
|
-
const n = node;
|
|
379
|
-
return {
|
|
380
|
-
url: String(n.url || ''),
|
|
381
|
-
status: Number(n.status || 0),
|
|
382
|
-
html: typeof n.html === 'string' ? n.html : '',
|
|
383
|
-
depth: Number(n.depth || 0)
|
|
384
|
-
};
|
|
385
|
-
}).filter((page) => Boolean(page.url));
|
|
386
|
-
}
|
|
387
|
-
return [];
|
|
388
|
-
}
|
|
389
|
-
function graphFromPages(rootUrl, pages, raw) {
|
|
390
|
-
const graph = new Graph();
|
|
391
|
-
for (const page of pages) {
|
|
392
|
-
graph.addNode(page.url, page.depth || 0, page.status || 0);
|
|
393
|
-
}
|
|
394
|
-
if (Array.isArray(raw.edges)) {
|
|
395
|
-
for (const edge of raw.edges) {
|
|
396
|
-
const e = edge;
|
|
397
|
-
if (typeof e.source === 'string' && typeof e.target === 'string') {
|
|
398
|
-
graph.addNode(e.source, 0, 0);
|
|
399
|
-
graph.addNode(e.target, 0, 0);
|
|
400
|
-
graph.addEdge(e.source, e.target);
|
|
401
|
-
}
|
|
402
351
|
}
|
|
403
|
-
|
|
404
|
-
}
|
|
405
|
-
for (const page of pages) {
|
|
406
|
-
if (!page.html)
|
|
407
|
-
continue;
|
|
408
|
-
const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
|
|
409
|
-
if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
|
|
410
|
-
continue;
|
|
411
|
-
}
|
|
412
|
-
return graph;
|
|
352
|
+
};
|
|
353
|
+
return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
|
|
413
354
|
}
|
|
414
|
-
async function runLiveCrawl(url, options) {
|
|
355
|
+
async function runLiveCrawl(url, options, context) {
|
|
415
356
|
const snapshotId = await crawl(url, {
|
|
416
|
-
limit: 1,
|
|
357
|
+
limit: 1, // Always limit to 1 for single page live analysis
|
|
417
358
|
depth: 0,
|
|
418
359
|
rate: options.rate,
|
|
419
360
|
proxyUrl: options.proxyUrl,
|
|
420
361
|
userAgent: options.userAgent,
|
|
421
362
|
maxRedirects: options.maxRedirects,
|
|
422
|
-
debug: options.debug
|
|
423
|
-
|
|
363
|
+
debug: options.debug,
|
|
364
|
+
snapshotType: 'partial'
|
|
365
|
+
}, context);
|
|
424
366
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
425
367
|
const pages = graph.getNodes().map((node) => ({
|
|
426
368
|
url: node.url,
|
|
427
369
|
status: node.status,
|
|
428
370
|
html: node.html || '', // Include HTML
|
|
429
|
-
depth: node.depth
|
|
371
|
+
depth: node.depth,
|
|
372
|
+
crawlStatus: node.crawlStatus
|
|
430
373
|
}));
|
|
431
374
|
return {
|
|
432
375
|
pages,
|
|
433
376
|
metrics: calculateMetrics(graph, 1),
|
|
434
|
-
graph
|
|
377
|
+
graph,
|
|
378
|
+
snapshotId
|
|
435
379
|
};
|
|
436
380
|
}
|
package/dist/analysis/scoring.js
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
export function scorePageSeo(page) {
|
|
2
|
+
if (page.meta.crawlStatus === 'blocked_by_robots') {
|
|
3
|
+
return 0;
|
|
4
|
+
}
|
|
2
5
|
const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
|
|
3
6
|
const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
|
|
4
7
|
const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
|
|
@@ -33,7 +36,10 @@ export function aggregateSiteScore(metrics, pages) {
|
|
|
33
36
|
const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
|
|
34
37
|
const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
|
|
35
38
|
const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
|
|
36
|
-
|
|
39
|
+
let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
|
|
40
|
+
if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
|
|
41
|
+
overallScore = 0;
|
|
42
|
+
}
|
|
37
43
|
return {
|
|
38
44
|
seoHealthScore: Number(seoHealthScore.toFixed(2)),
|
|
39
45
|
authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
5
|
+
const __dirname = path.dirname(__filename);
|
|
6
|
+
export const ANALYSIS_LIST_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_list.html'), 'utf-8');
|
|
7
|
+
export const ANALYSIS_PAGE_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_page.html'), 'utf-8');
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as dns from 'dns';
|
|
2
|
+
import { Agent } from 'undici';
|
|
1
3
|
export declare class IPGuard {
|
|
2
4
|
/**
|
|
3
5
|
* Checks if an IP address is internal/private
|
|
@@ -7,5 +9,14 @@ export declare class IPGuard {
|
|
|
7
9
|
* Resolves a hostname and validates all result IPs
|
|
8
10
|
*/
|
|
9
11
|
static validateHost(host: string): Promise<boolean>;
|
|
12
|
+
/**
|
|
13
|
+
* Custom lookup function for undici that validates the resolved IP.
|
|
14
|
+
* Prevents DNS Rebinding attacks by checking the IP immediately before connection.
|
|
15
|
+
*/
|
|
16
|
+
static secureLookup(hostname: string, options: dns.LookupOneOptions | dns.LookupAllOptions, callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void): void;
|
|
17
|
+
/**
|
|
18
|
+
* Returns an undici Agent configured with secure DNS lookup.
|
|
19
|
+
*/
|
|
20
|
+
static getSecureDispatcher(): Agent;
|
|
10
21
|
private static expandIPv6;
|
|
11
22
|
}
|