@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import { ClusterInfo } from '../graph/graph.js';
|
|
2
|
+
import { analyzeContent } from './content.js';
|
|
3
|
+
import { H1Analysis, TextFieldAnalysis } from './seo.js';
|
|
4
|
+
import { ImageAltAnalysis } from './images.js';
|
|
5
|
+
import { LinkRatioAnalysis } from './links.js';
|
|
6
|
+
import { StructuredDataResult } from './structuredData.js';
|
|
7
|
+
import { aggregateSiteScore } from './scoring.js';
|
|
8
|
+
export interface CrawlPage {
|
|
9
|
+
url: string;
|
|
10
|
+
status?: number;
|
|
11
|
+
html?: string;
|
|
12
|
+
depth?: number;
|
|
13
|
+
canonical?: string;
|
|
14
|
+
noindex?: boolean;
|
|
15
|
+
nofollow?: boolean;
|
|
16
|
+
}
|
|
17
|
+
export interface AnalyzeOptions {
|
|
18
|
+
fromCrawl?: string;
|
|
19
|
+
live?: boolean;
|
|
20
|
+
html?: boolean;
|
|
21
|
+
seo?: boolean;
|
|
22
|
+
content?: boolean;
|
|
23
|
+
accessibility?: boolean;
|
|
24
|
+
rate?: number;
|
|
25
|
+
proxyUrl?: string;
|
|
26
|
+
userAgent?: string;
|
|
27
|
+
maxRedirects?: number;
|
|
28
|
+
debug?: boolean;
|
|
29
|
+
clusterThreshold?: number;
|
|
30
|
+
minClusterSize?: number;
|
|
31
|
+
}
|
|
32
|
+
export interface PageAnalysis {
|
|
33
|
+
url: string;
|
|
34
|
+
status: number;
|
|
35
|
+
title: TextFieldAnalysis;
|
|
36
|
+
metaDescription: TextFieldAnalysis;
|
|
37
|
+
h1: H1Analysis;
|
|
38
|
+
content: ReturnType<typeof analyzeContent>;
|
|
39
|
+
thinScore: number;
|
|
40
|
+
images: ImageAltAnalysis;
|
|
41
|
+
links: LinkRatioAnalysis;
|
|
42
|
+
structuredData: StructuredDataResult;
|
|
43
|
+
seoScore: number;
|
|
44
|
+
meta: {
|
|
45
|
+
canonical?: string;
|
|
46
|
+
noindex?: boolean;
|
|
47
|
+
nofollow?: boolean;
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
export interface AnalysisResult {
|
|
51
|
+
site_summary: {
|
|
52
|
+
pages_analyzed: number;
|
|
53
|
+
avg_seo_score: number;
|
|
54
|
+
thin_pages: number;
|
|
55
|
+
duplicate_titles: number;
|
|
56
|
+
site_score: number;
|
|
57
|
+
};
|
|
58
|
+
site_scores: ReturnType<typeof aggregateSiteScore>;
|
|
59
|
+
pages: PageAnalysis[];
|
|
60
|
+
active_modules: {
|
|
61
|
+
seo: boolean;
|
|
62
|
+
content: boolean;
|
|
63
|
+
accessibility: boolean;
|
|
64
|
+
};
|
|
65
|
+
clusters?: ClusterInfo[];
|
|
66
|
+
}
|
|
67
|
+
export declare function analyzeSite(url: string, options: AnalyzeOptions): Promise<AnalysisResult>;
|
|
68
|
+
export declare function renderAnalysisHtml(result: AnalysisResult): string;
|
|
69
|
+
export declare function renderAnalysisMarkdown(result: AnalysisResult): string;
|
|
70
|
+
export declare function renderAnalysisCsv(result: AnalysisResult): string;
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import { crawl } from '../crawler/crawl.js';
|
|
3
|
+
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
4
|
+
import { normalizeUrl } from '../crawler/normalize.js';
|
|
5
|
+
import { calculateMetrics } from '../graph/metrics.js';
|
|
6
|
+
import { Graph } from '../graph/graph.js';
|
|
7
|
+
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
8
|
+
import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses } from './seo.js';
|
|
9
|
+
import { analyzeImageAlts } from './images.js';
|
|
10
|
+
import { analyzeLinks } from './links.js';
|
|
11
|
+
import { analyzeStructuredData } from './structuredData.js';
|
|
12
|
+
import { aggregateSiteScore, scorePageSeo } from './scoring.js';
|
|
13
|
+
import { detectContentClusters } from '../graph/cluster.js';
|
|
14
|
+
import { getDb } from '../db/index.js';
|
|
15
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
16
|
+
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
17
|
+
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
18
|
+
/**
 * Run the page-level analysis for `url` and summarise it.
 *
 * Data source selection:
 *  - `options.live` truthy: a live crawl is performed.
 *  - otherwise stored crawl data is loaded (optionally from the
 *    `options.fromCrawl` file); if that fails with a "not found" style error
 *    and no explicit `fromCrawl` was given, a live crawl is used instead.
 *
 * The summary counters (`pages_analyzed`, thin/duplicate counts, site scores)
 * are always computed over every analyzed page, while `pages` in the result is
 * narrowed to the requested URL when that URL is among the analyzed pages.
 *
 * @param {string} url - URL to analyze; normalized before use.
 * @param {object} options - Analysis options (see `AnalyzeOptions`).
 * @returns {Promise<object>} The analysis result (see `AnalysisResult`).
 * @throws {Error} When the URL cannot be normalized, or when loading stored
 *   crawl data fails for a reason that does not permit the live fallback.
 */
export async function analyzeSite(url, options) {
    const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
    if (!normalizedRoot) {
        throw new Error('Invalid URL for analysis');
    }
    let crawlData;
    if (options.live) {
        crawlData = await runLiveCrawl(normalizedRoot, options);
    }
    else {
        try {
            crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
        }
        catch (error) {
            // A thrown value is not guaranteed to be an Error with a string
            // `message`; read it defensively so the original failure is what
            // gets rethrown instead of a TypeError from `.includes`.
            const message = typeof error?.message === 'string' ? error.message : '';
            const isNotFound = error?.code === 'ENOENT' ||
                message.includes('Crawl data not found') ||
                message.includes('No completed snapshot found') ||
                message.includes('not found in database');
            // Only fall back silently when the caller did not point at a
            // specific crawl file.
            if (isNotFound && !options.fromCrawl) {
                console.log('No local crawl data found. Switching to live analysis mode...');
                crawlData = await runLiveCrawl(normalizedRoot, options);
            }
            else {
                throw error;
            }
        }
    }
    // Clustering always runs; threshold/size may be undefined, in which case
    // detectContentClusters receives undefined for them.
    detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
    const pages = analyzePages(normalizedRoot, crawlData.pages);
    const activeModules = {
        seo: Boolean(options.seo),
        content: Boolean(options.content),
        accessibility: Boolean(options.accessibility)
    };
    // With no module flag set, pages are returned unfiltered.
    const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
    const filteredPages = hasFilters
        ? pages.map((page) => filterPageModules(page, activeModules))
        : pages;
    // Narrow the reported pages to the requested URL when it was analyzed.
    const targetPage = filteredPages.find((p) => p.url === normalizedRoot);
    const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
    // Summary figures use the unfiltered analysis of every page.
    const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
    const thinPages = pages.filter((page) => page.thinScore >= 70).length;
    const siteScores = aggregateSiteScore(crawlData.metrics, pages);
    return {
        site_summary: {
            pages_analyzed: pages.length,
            avg_seo_score: siteScores.seoHealthScore,
            thin_pages: thinPages,
            duplicate_titles: duplicateTitles,
            site_score: siteScores.overallScore
        },
        site_scores: siteScores,
        pages: resultPages,
        active_modules: activeModules,
        // presumably populated on the graph by detectContentClusters above — confirm
        clusters: crawlData.graph.contentClusters
    };
}
|
|
77
|
+
/**
 * Render an analysis result as a standalone HTML document.
 *
 * A result holding exactly one page is rendered with the detailed single-page
 * layout; any other page count produces a summary table with one row per page.
 *
 * @param {object} result - Analysis result (`pages` and `site_summary` are read).
 * @returns {string} Complete HTML document.
 */
export function renderAnalysisHtml(result) {
    if (result.pages.length === 1) {
        return renderSinglePageHtml(result.pages[0]);
    }
    // Tags must not contain a space after "<" ("< tr >" is rendered as text,
    // not parsed as a row), so the row markup is written without padding.
    const rows = result.pages
        .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${escapeHtml(String(page.title.status))}</td><td>${escapeHtml(String(page.metaDescription.status))}</td></tr>`)
        .join('');
    return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
}
|
|
86
|
+
/**
 * Render the detailed HTML report for one analyzed page.
 *
 * Values that originate from the crawled page (URL, title, description,
 * canonical, structured-data type names) are HTML-escaped before being placed
 * in the document.
 *
 * @param {object} page - A single page analysis (see `PageAnalysis`).
 * @returns {string} Complete HTML document.
 */
function renderSinglePageHtml(page) {
    const urlText = escapeHtml(page.url);
    // The href sits inside a double-quoted attribute, so double quotes must be
    // neutralised even if escapeHtml itself leaves them alone.
    const urlAttr = urlText.replaceAll('"', '&quot;');
    // Type names come from the page's own markup; never emit them raw.
    const typeList = page.structuredData.present
        ? page.structuredData.types.map((t) => `<code>${escapeHtml(String(t))}</code>`).join(', ')
        : '';
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Analysis for ${urlText}</title>
<style>
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
.status-ok { color: green; font-weight: bold; }
.status-warning { color: orange; font-weight: bold; }
.status-critical { color: red; font-weight: bold; }
.status-missing { color: red; font-weight: bold; }
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
.data-table th { width: 150px; color: #666; }
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
</style>
</head>
<body>
<h1>Page Analysis</h1>
<p><strong>URL:</strong> <a href="${urlAttr}" target="_blank" rel="noopener noreferrer">${urlText}</a></p>

<div class="score-card">
<div class="score-box">
<div class="score-val">${page.seoScore}</div>
<div>SEO Score</div>
</div>
<div class="score-box">
<div class="score-val">${page.thinScore}</div>
<div>Thin Content Score</div>
</div>
<div class="score-box">
<div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
<div>HTTP Status</div>
</div>
</div>

<h2>Meta Tags</h2>
<table class="data-table">
<tr>
<th>Title</th>
<td>
<div>${escapeHtml(page.title.value || '(missing)')}</div>
<small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
</td>
</tr>
<tr>
<th>Description</th>
<td>
<div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
<small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
</td>
</tr>
<tr>
<th>Canonical</th>
<td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
</tr>
<tr>
<th>Robots</th>
<td>
Index: ${!page.meta.noindex},
Follow: ${!page.meta.nofollow}
</td>
</tr>
</table>

<h2>Content &amp; Heading</h2>
<table class="data-table">
<tr>
<th>H1 Tag</th>
<td>
Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
(${page.h1.count} detected)
${page.h1.matchesTitle ? ' | Matches Title' : ''}
</td>
</tr>
<tr>
<th>Word Count</th>
<td>${page.content.wordCount} words</td>
</tr>
<tr>
<th>Unique Sentences</th>
<td>${page.content.uniqueSentenceCount}</td>
</tr>
<tr>
<th>Text / HTML Ratio</th>
<td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
</tr>
</table>

<h2>Links &amp; Images</h2>
<table class="data-table">
<tr>
<th>Internal Links</th>
<td>${page.links.internalLinks}</td>
</tr>
<tr>
<th>External Links</th>
<td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
</tr>
<tr>
<th>Images</th>
<td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
</tr>
</table>

<h2>Structured Data</h2>
<table class="data-table">
<tr>
<th>Status</th>
<td>
${page.structuredData.present
        ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
        : 'Not detected'}
</td>
</tr>
${page.structuredData.present ? `
<tr>
<th>Types Found</th>
<td>${typeList}</td>
</tr>
` : ''}
</table>
</body>
</html>`;
}
|
|
218
|
+
/**
 * Render an analysis result as a Markdown report: a summary list followed by
 * a table with one row per page.
 *
 * @param {object} result - Analysis result (`site_summary` and `pages` are read).
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function renderAnalysisMarkdown(result) {
    // A literal "|" inside a cell would be read as a column separator and
    // break the row, so it is backslash-escaped.
    const cell = (value) => String(value).replaceAll('|', '\\|');
    const summary = [
        '# Crawlith SEO Analysis Report',
        '',
        '## 📊 Summary',
        `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
        `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
        `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
        `- Thin Pages Found: ${result.site_summary.thin_pages}`,
        `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
        '',
        '## 📄 Page Details',
        '',
        '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
        '| :--- | :--- | :--- | :--- | :--- |',
    ];
    for (const page of result.pages) {
        summary.push(`| ${cell(page.url)} | ${page.seoScore} | ${page.thinScore} | ${cell(page.title.status)} | ${cell(page.metaDescription.status)} |`);
    }
    return summary.join('\n');
}
|
|
239
|
+
/**
 * Render an analysis result as CSV: one header line followed by one line per
 * page, joined with "\n".
 *
 * Title and meta description are always emitted as quoted fields. The URL is
 * emitted bare unless it contains a comma, double quote or line break, in
 * which case it is quoted too so that the column layout stays intact.
 *
 * @param {object} result - Analysis result (`pages` is read).
 * @returns {string} CSV text.
 */
export function renderAnalysisCsv(result) {
    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
    // Quoted field with embedded quotes doubled.
    const quoted = (text) => `"${String(text).replaceAll('"', '""')}"`;
    // Quote only when the raw value would otherwise corrupt the row.
    const quotedIfNeeded = (text) => (/[",\r\n]/.test(String(text)) ? quoted(text) : text);
    const rows = result.pages.map((p) => {
        // Status 0 marks a page without a recorded HTTP status.
        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
        return [
            quotedIfNeeded(p.url),
            p.seoScore,
            p.thinScore,
            statusStr,
            quoted(p.title.value || ''),
            p.title.length,
            quoted(p.metaDescription.value || ''),
            p.metaDescription.length,
            p.content.wordCount,
            p.links.internalLinks,
            p.links.externalLinks
        ].join(',');
    });
    return [headers.join(','), ...rows].join('\n');
}
|
|
259
|
+
/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute.
 *
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 *
 * @param {*} value - Value to escape; null/undefined become an empty string,
 *   anything else is converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(value) {
    return String(value ?? '')
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .replaceAll('"', '&quot;')
        .replaceAll("'", '&#39;');
}
|
|
262
|
+
/**
 * Build the per-page analysis records for a list of crawled pages.
 *
 * Titles and meta descriptions are analyzed for all pages first so that
 * duplicate statuses can be assigned across the whole set. A page's
 * duplication score is 100 when at least one other page has the same number
 * of unique sentences, otherwise 0.
 *
 * @param {string} rootUrl - Root URL handed to the link analysis.
 * @param {Array<object>} pages - Crawled pages (`url`, `status`, `html`, ...).
 * @returns {Array<object>} One analysis record per input page, in order.
 */
function analyzePages(rootUrl, pages) {
    const markupOf = (entry) => entry.html || '';
    const rawTitles = pages.map((entry) => analyzeTitle(markupOf(entry)));
    const rawMetas = pages.map((entry) => analyzeMetaDescription(markupOf(entry)));
    const titleResults = applyDuplicateStatuses(rawTitles);
    const metaResults = applyDuplicateStatuses(rawMetas);
    const contentResults = pages.map((entry) => analyzeContent(markupOf(entry)));
    // How many pages share each unique-sentence count.
    const pagesPerSentenceCount = new Map();
    contentResults.forEach(({ uniqueSentenceCount }) => {
        const seen = pagesPerSentenceCount.get(uniqueSentenceCount) || 0;
        pagesPerSentenceCount.set(uniqueSentenceCount, seen + 1);
    });
    const records = [];
    for (let i = 0; i < pages.length; i += 1) {
        const entry = pages[i];
        const markup = markupOf(entry);
        const title = titleResults[i];
        const content = contentResults[i];
        const sharedBy = pagesPerSentenceCount.get(content.uniqueSentenceCount) || 0;
        let duplicationScore = 0;
        if (sharedBy > 1) {
            duplicationScore = 100;
        }
        const record = {
            url: entry.url,
            status: entry.status || 0,
            title,
            metaDescription: metaResults[i],
            h1: analyzeH1(markup, title.value),
            content,
            thinScore: calculateThinContentScore(content, duplicationScore),
            images: analyzeImageAlts(markup),
            links: analyzeLinks(markup, entry.url, rootUrl),
            structuredData: analyzeStructuredData(markup),
            // Placeholder; the scorer receives the record with a zero score.
            seoScore: 0,
            meta: {
                canonical: entry.canonical,
                noindex: entry.noindex,
                nofollow: entry.nofollow
            }
        };
        record.seoScore = scorePageSeo(record);
        records.push(record);
    }
    return records;
}
|
|
305
|
+
/**
 * Return a copy of a page analysis in which the sections belonging to
 * inactive modules are replaced by neutral placeholder values.
 *
 *  - seo: title, metaDescription, links, structuredData
 *  - content: content, thinScore
 *  - seo or content: h1
 *  - accessibility: images
 *
 * @param {object} page - Page analysis record; not modified.
 * @param {{seo: boolean, content: boolean, accessibility: boolean}} modules
 * @returns {object} New page analysis record.
 */
function filterPageModules(page, modules) {
    const { seo, content: contentOn, accessibility } = modules;
    let { title, metaDescription, h1, links, structuredData, content, thinScore, images } = page;
    if (!seo) {
        title = { value: null, length: 0, status: 'missing' };
        metaDescription = { value: null, length: 0, status: 'missing' };
        links = { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 };
        structuredData = { present: false, valid: false, types: [] };
    }
    if (!seo && !contentOn) {
        h1 = { count: 0, status: 'critical', matchesTitle: false };
    }
    if (!contentOn) {
        content = { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 };
        thinScore = 0;
    }
    if (!accessibility) {
        images = { totalImages: 0, missingAlt: 0, emptyAlt: 0 };
    }
    return {
        ...page,
        title,
        metaDescription,
        h1,
        links,
        structuredData,
        content,
        thinScore,
        images
    };
}
|
|
321
|
+
/**
 * Load previously stored crawl data for `rootUrl`.
 *
 * When `fromCrawl` is given, the JSON file at that path is tried first. If it
 * cannot be used, loading falls back to the latest completed snapshot in the
 * database; a failure other than "file does not exist" is reported with a
 * warning before falling back, so a corrupt file is not silently ignored.
 *
 * @param {string} rootUrl - Normalized root URL of the site.
 * @param {string} [fromCrawl] - Optional path to a JSON crawl export.
 * @returns {Promise<{pages: Array<object>, metrics: object, graph: object}>}
 * @throws {Error} When the database holds no completed snapshot for the site.
 */
async function loadCrawlData(rootUrl, fromCrawl) {
    if (fromCrawl) {
        try {
            const content = await fs.readFile(fromCrawl, 'utf-8');
            const raw = JSON.parse(content);
            const pages = parsePages(raw);
            const graph = graphFromPages(rootUrl, pages, raw);
            const metrics = calculateMetrics(graph, 5);
            return { pages, metrics, graph };
        }
        catch (error) {
            // A missing file is the expected reason to fall back. Anything
            // else (unreadable file, invalid JSON, ...) still falls back but
            // is surfaced so the caller knows the file was not used.
            if (error?.code !== 'ENOENT') {
                const reason = typeof error?.message === 'string' ? error.message : String(error);
                console.warn(`Could not use crawl file ${fromCrawl} (${reason}); falling back to database.`);
            }
        }
    }
    const db = getDb();
    const siteRepo = new SiteRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pageRepo = new PageRepository(db);
    const urlObj = new URL(rootUrl);
    // Strip only a leading "www." — a plain string replace would also remove
    // the first "www." found in the middle of a hostname.
    const domain = urlObj.hostname.replace(/^www\./, '');
    const site = siteRepo.firstOrCreateSite(domain);
    const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
    if (!snapshot) {
        throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
    }
    const graph = loadGraphFromSnapshot(snapshot.id);
    const metrics = calculateMetrics(graph, 5);
    // The page analysis needs each page's HTML, so the pages are read from
    // the page repository rather than taken from the graph nodes.
    const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
    const pages = dbPages.map((p) => ({
        url: p.normalized_url,
        status: p.http_status || 0,
        html: p.html || '',
        depth: p.depth || 0
    }));
    return { pages, metrics, graph };
}
|
|
364
|
+
/**
 * Extract page records from a parsed crawl export.
 *
 * `raw.pages` is used when it is an array; otherwise `raw.nodes` is used when
 * that is an array; otherwise the result is empty. Each entry is coerced to
 * `{ url, status, html, depth }` and entries with an empty URL are dropped.
 *
 * @param {object} raw - Parsed JSON crawl export.
 * @returns {Array<{url: string, status: number, html: string, depth: number}>}
 */
function parsePages(raw) {
    const toRecord = (entry) => ({
        url: String(entry.url || ''),
        status: Number(entry.status || 0),
        html: typeof entry.html === 'string' ? entry.html : '',
        depth: Number(entry.depth || 0)
    });
    // First candidate that is an array wins, even when it is empty.
    const entries = [raw.pages, raw.nodes].find((candidate) => Array.isArray(candidate));
    if (entries === undefined) {
        return [];
    }
    const records = [];
    for (const entry of entries) {
        const record = toRecord(entry);
        if (record.url) {
            records.push(record);
        }
    }
    return records;
}
|
|
389
|
+
/**
 * Build a link graph from parsed pages and, when present, the edge list of
 * the crawl export.
 *
 * Every page becomes a node. If `raw.edges` is an array, each entry with
 * string `source` and `target` adds both endpoints as nodes and an edge
 * between them. Without an edge list the graph contains nodes only.
 *
 * @param {string} rootUrl - Root URL of the site (kept for interface
 *   compatibility; not needed to build the graph).
 * @param {Array<object>} pages - Page records (`url`, `depth`, `status`).
 * @param {object} raw - Parsed crawl export; `raw.edges` is optional.
 * @returns {Graph} The populated graph.
 */
function graphFromPages(rootUrl, pages, raw) {
    const graph = new Graph();
    for (const page of pages) {
        graph.addNode(page.url, page.depth || 0, page.status || 0);
    }
    if (Array.isArray(raw.edges)) {
        for (const edge of raw.edges) {
            // Skip entries that do not name both endpoints as strings.
            if (typeof edge.source === 'string' && typeof edge.target === 'string') {
                graph.addNode(edge.source, 0, 0);
                graph.addNode(edge.target, 0, 0);
                graph.addEdge(edge.source, edge.target);
            }
        }
    }
    // The previous implementation re-parsed every page's HTML here and then
    // discarded the result; that pass added nothing to the graph and has been
    // removed.
    return graph;
}
|
|
414
|
+
/**
 * Crawl a single URL live (limit 1, depth 0) and return the data the
 * analysis needs: page records, graph metrics and the graph itself.
 *
 * @param {string} url - URL to crawl.
 * @param {object} options - Only `rate`, `proxyUrl`, `userAgent`,
 *   `maxRedirects` and `debug` are forwarded to the crawler.
 * @returns {Promise<{pages: Array<object>, metrics: object, graph: object}>}
 */
async function runLiveCrawl(url, options) {
    const { rate, proxyUrl, userAgent, maxRedirects, debug } = options;
    const crawlSettings = {
        limit: 1,
        depth: 0,
        rate,
        proxyUrl,
        userAgent,
        maxRedirects,
        debug
    };
    const snapshotId = await crawl(url, crawlSettings);
    const graph = loadGraphFromSnapshot(snapshotId);
    // NOTE(review): comments elsewhere in this module say the snapshot loader
    // does not attach raw HTML to graph nodes; if so, `html` is always '' here
    // and the page analysis runs on empty markup — confirm against graphLoader.
    const pages = graph.getNodes().map(({ url: nodeUrl, status, html, depth }) => ({
        url: nodeUrl,
        status,
        html: html || '',
        depth
    }));
    const metrics = calculateMetrics(graph, 1);
    return { pages, metrics, graph };
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * Text statistics produced by `analyzeContent` for one HTML document.
 *
 * - `wordCount`: number of whitespace-separated words in the extracted text.
 * - `textHtmlRatio`: length of the extracted, whitespace-collapsed text
 *   divided by the length of the raw HTML input.
 * - `uniqueSentenceCount`: number of distinct sentences after splitting the
 *   text on `.`, `!` and `?`, trimming and lower-casing each fragment.
 */
export interface ContentAnalysis {
|
|
2
|
+
wordCount: number;
|
|
3
|
+
textHtmlRatio: number;
|
|
4
|
+
uniqueSentenceCount: number;
|
|
5
|
+
}
|
|
6
|
+
/**
 * Weights used by `calculateThinContentScore` to blend its three inputs.
 *
 * - `lowWordWeight`: multiplier for the low-word-count penalty.
 * - `ratioWeight`: multiplier for the low text-to-HTML-ratio penalty.
 * - `dupWeight`: multiplier for the caller-supplied duplication score.
 */
export interface ThinScoreWeights {
|
|
7
|
+
lowWordWeight: number;
|
|
8
|
+
ratioWeight: number;
|
|
9
|
+
dupWeight: number;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Extracts text statistics from an HTML document: word count, text-to-HTML
 * length ratio and the number of distinct sentences.
 *
 * @param html - HTML source to analyse.
 * @returns The computed content statistics.
 */
export declare function analyzeContent(html: string): ContentAnalysis;
|
|
12
|
+
/**
 * Combines a word-count penalty, a text-ratio penalty and a duplication
 * score into a single weighted value.
 *
 * @param content - Statistics returned by `analyzeContent`.
 * @param duplicationScore - Duplication score supplied by the caller.
 * @param weights - Optional weights for the three components.
 * @returns The weighted score, rounded to two decimals and clamped to 0-100.
 */
export declare function calculateThinContentScore(content: ContentAnalysis, duplicationScore: number, weights?: ThinScoreWeights): number;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
/**
 * Default component weights for the thin-content score.
 *
 * Frozen because this single object is handed out as the default `weights`
 * argument; an accidental write by any consumer would otherwise change the
 * defaults for every later call.
 */
const DEFAULT_WEIGHTS = Object.freeze({
    lowWordWeight: 0.4,
    ratioWeight: 0.35,
    dupWeight: 0.25
});
|
|
7
|
+
/**
 * Extracts basic text statistics from an HTML document.
 *
 * `script`, `style`, `nav` and `footer` elements are removed before the text
 * is read, so their contents do not count towards any statistic.
 *
 * @param {string} html - HTML source. Empty, null or undefined input is
 *   treated as an empty document and yields all-zero statistics.
 * @returns {{wordCount: number, textHtmlRatio: number, uniqueSentenceCount: number}}
 *   `wordCount` is the number of whitespace-separated words, `textHtmlRatio`
 *   is the cleaned text length divided by the raw HTML length, and
 *   `uniqueSentenceCount` is the number of distinct lower-cased sentences.
 */
export function analyzeContent(html) {
    // Normalise once so the parser input and the length measurement agree;
    // previously a null/undefined `html` was parsed fine but then crashed on
    // `html.length`.
    const source = html || '';
    const $ = load(source || '<html></html>');
    $('script,style,nav,footer').remove();
    const text = $('body').length ? $('body').text() : $.text();
    const cleanText = text.replace(/\s+/g, ' ').trim();
    const words = cleanText ? cleanText.split(/\s+/).filter(Boolean) : [];
    const wordCount = words.length;
    // Floor the denominator at 1 so an empty document gives a ratio of 0.
    const htmlLength = Math.max(source.length, 1);
    const textHtmlRatio = cleanText.length / htmlLength;
    // Sentences are compared case-insensitively after trimming.
    const sentenceSet = new Set(cleanText
        .split(/[.!?]+/)
        .map((item) => item.trim().toLowerCase())
        .filter(Boolean));
    return {
        wordCount,
        textHtmlRatio,
        uniqueSentenceCount: sentenceSet.size
    };
}
|
|
26
|
+
/**
 * Computes a thin-content score from content statistics and a duplication
 * score.
 *
 * A word count below 300 and a text-to-HTML ratio below 0.2 each contribute
 * a penalty that grows linearly to 100 as the value approaches zero.
 *
 * @param {{wordCount: number, textHtmlRatio: number}} content - Content statistics.
 * @param {number} duplicationScore - Duplication score supplied by the caller.
 * @param {{lowWordWeight: number, ratioWeight: number, dupWeight: number}} [weights]
 *   Component weights; defaults to DEFAULT_WEIGHTS.
 * @returns {number} Weighted score rounded to two decimals, clamped to 0-100.
 */
export function calculateThinContentScore(content, duplicationScore, weights = DEFAULT_WEIGHTS) {
    // Penalty for falling short of `target`: 0 at or above it, up to 100 at zero.
    const shortfall = (value, target) => {
        if (value >= target) {
            return 0;
        }
        return 100 - Math.min(100, (value / target) * 100);
    };
    const wordPenalty = shortfall(content.wordCount, 300);
    const ratioPenalty = shortfall(content.textHtmlRatio, 0.2);
    let weighted = weights.lowWordWeight * wordPenalty;
    weighted += weights.ratioWeight * ratioPenalty;
    weighted += weights.dupWeight * duplicationScore;
    const rounded = Number(weighted.toFixed(2));
    return Math.max(0, Math.min(100, rounded));
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
/**
 * Counts `<img>` elements in an HTML document and classifies their `alt`
 * attributes.
 *
 * @param {string} html - HTML source to inspect.
 * @returns {{totalImages: number, missingAlt: number, emptyAlt: number}}
 *   `missingAlt` counts images without an `alt` attribute; `emptyAlt` counts
 *   images whose `alt` is present but blank after trimming.
 */
export function analyzeImageAlts(html) {
    const $ = load(html);
    const images = $('img').toArray();
    let missingAlt = 0;
    let emptyAlt = 0;
    for (const img of images) {
        const altText = $(img).attr('alt');
        if (altText === undefined) {
            missingAlt += 1;
        }
        else if (altText.trim() === '') {
            emptyAlt += 1;
        }
    }
    const totalImages = images.length;
    return { totalImages, missingAlt, emptyAlt };
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
+
/**
 * Counts the anchors of a page, split into internal and external links.
 *
 * Each `href` is normalised against `pageUrl` (query strings preserved);
 * anchors that cannot be normalised are ignored. A link is internal when its
 * origin equals the origin of `rootUrl`.
 *
 * @param {string} html - HTML source of the page.
 * @param {string} pageUrl - URL of the page, used to resolve hrefs.
 * @param {string} rootUrl - Site root URL; its origin defines "internal".
 * @returns {{internalLinks: number, externalLinks: number, nofollowCount: number, externalRatio: number}}
 *   `nofollowCount` counts anchors whose `rel` contains "nofollow";
 *   `externalRatio` is external / (internal + external), or 0 with no links.
 */
export function analyzeLinks(html, pageUrl, rootUrl) {
    const $ = load(html);
    const rootOrigin = new URL(rootUrl).origin;
    const tally = { internal: 0, external: 0, nofollow: 0 };
    for (const anchor of $('a[href]').toArray()) {
        const $anchor = $(anchor);
        const href = $anchor.attr('href');
        if (!href) {
            continue;
        }
        const resolved = normalizeUrl(href, pageUrl, { stripQuery: false });
        if (!resolved) {
            continue;
        }
        const relValue = ($anchor.attr('rel') || '').toLowerCase();
        if (relValue.includes('nofollow')) {
            tally.nofollow += 1;
        }
        const isInternal = new URL(resolved).origin === rootOrigin;
        if (isInternal) {
            tally.internal += 1;
        }
        else {
            tally.external += 1;
        }
    }
    const linkTotal = tally.internal + tally.external;
    let externalRatio = 0;
    if (linkTotal !== 0) {
        externalRatio = tally.external / linkTotal;
    }
    return {
        internalLinks: tally.internal,
        externalLinks: tally.external,
        nofollowCount: tally.nofollow,
        externalRatio
    };
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { Metrics } from '../graph/metrics.js';
|
|
2
|
+
import type { PageAnalysis } from './analyze.js';
|
|
3
|
+
/**
 * Site-level scores returned by `aggregateSiteScore`.
 *
 * - `seoHealthScore`: mean of the per-page `seoScore` values, rounded to two
 *   decimals (0 when there are no pages).
 * - `authorityEntropyOrphanScore`: blend of average authority, an entropy
 *   score and the share of non-orphan pages, clamped to 0-100 and rounded to
 *   two decimals.
 * - `overallScore`: 70% `seoHealthScore` plus 30%
 *   `authorityEntropyOrphanScore`, rounded to two decimals.
 */
export interface SiteScore {
|
|
4
|
+
seoHealthScore: number;
|
|
5
|
+
authorityEntropyOrphanScore: number;
|
|
6
|
+
overallScore: number;
|
|
7
|
+
}
|
|
8
|
+
/**
 * Scores a single analysed page by blending title/meta-description status,
 * H1 status, content quality, thin-content score, image alt coverage,
 * structured data and link balance.
 *
 * @param page - Page analysis to score.
 * @returns The page score, clamped to 0-100 and rounded to two decimals.
 */
export declare function scorePageSeo(page: PageAnalysis): number;
|
|
9
|
+
/**
 * Aggregates per-page SEO scores and graph metrics into site-level scores.
 *
 * @param metrics - Graph metrics; the implementation reads
 *   `topAuthorityPages`, `structuralEntropy`, `orphanPages` and `totalPages`.
 * @param pages - Analysed pages; each page's `seoScore` is averaged.
 * @returns The aggregated site scores.
 */
export declare function aggregateSiteScore(metrics: Metrics, pages: PageAnalysis[]): SiteScore;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
 * Scores one analysed page on a 0-100 scale.
 *
 * Component weights: title/meta description 0.15, H1 0.10, word quality
 * 0.20, inverse thin score 0.20, image alt coverage 0.10, structured data
 * 0.10, link balance 0.15.
 *
 * @param {object} page - Page analysis providing `title`, `metaDescription`,
 *   `h1`, `content`, `thinScore`, `images`, `structuredData` and `links`.
 * @returns {number} Score clamped to 0-100 and rounded to two decimals.
 */
export function scorePageSeo(page) {
    const titleScore = scoreTextStatus(page.title.status);
    const descriptionScore = scoreTextStatus(page.metaDescription.status);
    const titleMeta = (titleScore + descriptionScore) / 2;

    let headingScore = 10;
    if (page.h1.status === 'ok') {
        headingScore = 100;
    }
    else if (page.h1.status === 'warning') {
        headingScore = 60;
    }

    // 600 words and a 0.2 text ratio each earn full marks for their part.
    const wordPart = Math.min(100, (page.content.wordCount / 600) * 100);
    const ratioPart = Math.min(100, page.content.textHtmlRatio * 500);
    const wordQuality = wordPart * 0.7 + ratioPart * 0.3;

    const thinInverse = 100 - page.thinScore;

    // Share of images with a missing or blank alt; pages without images score 100.
    const imageCount = Math.max(1, page.images.totalImages);
    const badAlts = page.images.missingAlt + page.images.emptyAlt;
    const altCoverage = Math.max(0, 100 - (badAlts / imageCount) * 100);

    let structuredScore = 30;
    if (page.structuredData.present) {
        structuredScore = page.structuredData.valid ? 100 : 40;
    }

    // Best balance is an external-link ratio of 0.3.
    const ratioDistance = Math.abs(page.links.externalRatio - 0.3);
    const linkBalance = Math.max(0, 100 - ratioDistance * 200);

    const total = titleMeta * 0.15 +
        headingScore * 0.1 +
        wordQuality * 0.2 +
        thinInverse * 0.2 +
        altCoverage * 0.1 +
        structuredScore * 0.1 +
        linkBalance * 0.15;
    const clamped = Math.max(0, Math.min(100, total));
    return Number(clamped.toFixed(2));
}
|
|
19
|
+
/**
 * Maps the status of a text field (title or meta description) to a score.
 *
 * @param {string} status - One of 'ok', 'duplicate', 'too_short', 'too_long'
 *   or 'missing'.
 * @returns {number} 100 for 'ok', 60 for 'too_short'/'too_long', 45 for
 *   'duplicate', and 0 for 'missing' or any unrecognised status.
 */
function scoreTextStatus(status) {
    switch (status) {
        case 'ok':
            return 100;
        case 'duplicate':
            return 45;
        case 'too_short':
        case 'too_long':
            return 60;
        case 'missing':
            return 0;
        default:
            // An unrecognised status used to fall through and return
            // undefined, which made the whole page score NaN. Score it like
            // a missing field instead.
            return 0;
    }
}
|
|
28
|
+
/**
 * Aggregates page-level SEO scores and graph metrics into site-level scores.
 *
 * The structural score blends average authority x 100 (weight 0.4), an
 * entropy score that peaks when structural entropy equals 2 (weight 0.35)
 * and the percentage of non-orphan pages (weight 0.25), clamped to 0-100.
 * The overall score is 70% SEO health plus 30% structural score.
 *
 * @param {object} metrics - Provides `topAuthorityPages`, `structuralEntropy`,
 *   `orphanPages` and `totalPages`.
 * @param {Array<{seoScore: number}>} pages - Analysed pages.
 * @returns {{seoHealthScore: number, authorityEntropyOrphanScore: number, overallScore: number}}
 *   All values rounded to two decimals.
 */
export function aggregateSiteScore(metrics, pages) {
    // Arithmetic mean of `pick(item)`; defined as 0 for an empty list.
    const average = (items, pick) => {
        if (items.length === 0) {
            return 0;
        }
        let sum = 0;
        for (const item of items) {
            sum = sum + pick(item);
        }
        return sum / items.length;
    };
    const round2 = (value) => Number(value.toFixed(2));

    const seoHealth = average(pages, (page) => page.seoScore);
    const meanAuthority = average(metrics.topAuthorityPages, (entry) => entry.authority);
    const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);

    let orphanPenalty = 0;
    if (metrics.totalPages !== 0) {
        orphanPenalty = (metrics.orphanPages.length / metrics.totalPages) * 100;
    }

    const structuralBlend = (meanAuthority * 100 * 0.4) +
        (entropyScore * 0.35) +
        ((100 - orphanPenalty) * 0.25);
    const structural = Math.max(0, Math.min(100, structuralBlend));
    const overallScore = round2(seoHealth * 0.7 + structural * 0.3);

    return {
        seoHealthScore: round2(seoHealth),
        authorityEntropyOrphanScore: round2(structural),
        overallScore
    };
}
|