@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/analysis/analyze.ts
DELETED
|
@@ -1,505 +0,0 @@
|
|
|
1
|
-
import { crawl } from '../crawler/crawl.js';
|
|
2
|
-
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
3
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
4
|
-
import { calculateMetrics, Metrics } from '../graph/metrics.js';
|
|
5
|
-
import { Graph, ClusterInfo } from '../graph/graph.js';
|
|
6
|
-
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
7
|
-
import { analyzeH1, analyzeMetaDescription, analyzeTitle, H1Analysis, TextFieldAnalysis } from './seo.js';
|
|
8
|
-
import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
|
|
9
|
-
import { analyzeLinks, LinkRatioAnalysis } from './links.js';
|
|
10
|
-
import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
|
|
11
|
-
import { aggregateSiteScore, scorePageSeo } from './scoring.js';
|
|
12
|
-
import { detectContentClusters } from '../graph/cluster.js';
|
|
13
|
-
import { getDb } from '../db/index.js';
|
|
14
|
-
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
15
|
-
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
16
|
-
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
17
|
-
import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
|
|
18
|
-
import { EngineContext } from '../events.js';
|
|
19
|
-
|
|
20
|
-
/**
 * One crawled page in the shape the analysis pipeline consumes.
 *
 * Only `url` is mandatory. The snapshot loader in this file fills in every
 * field, while the live-crawl path leaves `canonical`, `noindex` and
 * `nofollow` unset.
 */
export interface CrawlPage {
  /** Page URL; compared against the normalized root URL to find the requested page. */
  url: string;

  /** HTTP status code. Missing values are stored as `0` by `analyzePages`. */
  status?: number;

  /** Raw HTML of the page; analyzers receive an empty string when it is absent. */
  html?: string;

  /** Depth value recorded for the page by the crawl. */
  depth?: number;

  /** Canonical URL recorded for the page, if any. */
  canonical?: string;

  /** Whether the page was recorded as `noindex`. */
  noindex?: boolean;

  /** Whether the page was recorded as `nofollow`. */
  nofollow?: boolean;

  /** Crawl status label; `analyzePages` may override it with `'blocked_by_robots'`. */
  crawlStatus?: string;
}
|
|
30
|
-
|
|
31
|
-
/**
 * Options accepted by `analyzeSite`.
 */
export interface AnalyzeOptions {
  /**
   * Crawl the URL live instead of loading a stored snapshot.
   * `analyzeSite` also sets this flag on the passed object when it falls back
   * to a live crawl.
   */
  live?: boolean;

  /*
   * Module filters. When at least one of the three is set, the results of the
   * modules left unset are blanked out; when none is set nothing is filtered.
   */
  /** Keep SEO results (title, meta description, links, structured data). */
  seo?: boolean;
  /** Keep content results (content metrics, thin score). */
  content?: boolean;
  /** Keep accessibility results (image alt analysis). */
  accessibility?: boolean;

  /* Settings forwarded unchanged to the crawler when a live crawl runs. */
  rate?: number;
  proxyUrl?: string;
  userAgent?: string;
  maxRedirects?: number;
  debug?: boolean;

  /* Settings passed to `detectContentClusters`. */
  clusterThreshold?: number;
  minClusterSize?: number;

  /** Return every analyzed page rather than only the page matching the requested URL. */
  allPages?: boolean;
}
|
|
45
|
-
|
|
46
|
-
/**
 * Analysis outcome for a single page, as produced by `analyzePages`.
 */
export interface PageAnalysis {
  /** URL of the analyzed page. */
  url: string;
  /** HTTP status; `0` when the crawl recorded none (rendered as "Pending/Limit"). */
  status: number;

  /** `<title>` analysis; its status becomes `'duplicate'` when another page shares the title. */
  title: TextFieldAnalysis;
  /** Meta description analysis; flagged `'duplicate'` the same way as the title. */
  metaDescription: TextFieldAnalysis;
  /** H1 analysis, evaluated against the page title. */
  h1: H1Analysis;

  /** Content metrics returned by `analyzeContent`. */
  content: ReturnType<typeof analyzeContent>;
  /** Result of `calculateThinContentScore`; pages at 70 or above count as thin in the site summary. */
  thinScore: number;

  /** Image alt-text analysis. */
  images: ImageAltAnalysis;
  /** Link analysis for the page. */
  links: LinkRatioAnalysis;
  /** Structured-data detection result. */
  structuredData: StructuredDataResult;

  /** Result of `scorePageSeo` for this page. */
  seoScore: number;

  /** Crawl metadata copied from the source page. */
  meta: {
    canonical?: string;
    noindex?: boolean;
    nofollow?: boolean;
    /** May read `'blocked_by_robots'` when the current robots rules disallow the URL. */
    crawlStatus?: string;
  }
}
|
|
65
|
-
|
|
66
|
-
/**
 * Complete result returned by `analyzeSite` and consumed by the renderers.
 */
export interface AnalysisResult {
  /** Headline numbers for the report. */
  site_summary: {
    /** Number of entries in `pages`. */
    pages_analyzed: number;
    /** Taken from `site_scores.seoHealthScore`. */
    avg_seo_score: number;
    /** Pages whose thin score is 70 or higher. */
    thin_pages: number;
    /** Pages whose title status is `'duplicate'`. */
    duplicate_titles: number;
    /** Taken from `site_scores.overallScore`. */
    site_score: number;
  };

  /** Full output of `aggregateSiteScore`. */
  site_scores: ReturnType<typeof aggregateSiteScore>;

  /** Pages included in the report (all pages, or only the requested one). */
  pages: PageAnalysis[];

  /** Which module filters were requested through the options. */
  active_modules: {
    seo: boolean;
    content: boolean;
    accessibility: boolean;
  };

  /** Content clusters read from the crawl graph. */
  clusters?: ClusterInfo[];
  /** Identifier of the snapshot the analysis was based on. */
  snapshotId?: number;
  /** Snapshot creation time; only set when the data came from a stored snapshot. */
  crawledAt?: string;
}
|
|
85
|
-
|
|
86
|
-
/**
 * Internal bundle handed from the data-loading step (snapshot or live crawl)
 * to the analysis step.
 */
interface CrawlData {
  /**
   * Pages to analyze. The snapshot loader supplies a one-shot generator, which
   * `analyzeSite` materializes into an array before reusing it.
   */
  pages: Iterable<CrawlPage> | CrawlPage[];
  /** Graph metrics computed for `graph`. */
  metrics: Metrics;
  /** Link graph loaded from the snapshot. */
  graph: Graph;
  /** Snapshot the data belongs to. */
  snapshotId: number;
  /** Snapshot creation time; not provided by the live-crawl path. */
  crawledAt?: string;
}
|
|
93
|
-
|
|
94
|
-
/**
 * Message fragments that identify a "no stored crawl data" failure.
 * The last entry matches the error raised by the snapshot loader in this file
 * ("No crawl data found for <url> in database.").
 */
const MISSING_CRAWL_DATA_MARKERS = [
  'Crawl data not found',
  'No completed snapshot found',
  'not found in database',
  'No crawl data found'
];

/**
 * Tells whether a caught value means "there is no local crawl data".
 * Accepts any thrown value: non-objects and objects without a string
 * `message` are simply reported as "not a missing-data error".
 */
function isMissingCrawlDataError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) {
    return false;
  }
  const { code, message } = error as { code?: unknown; message?: unknown };
  if (code === 'ENOENT') {
    return true;
  }
  return typeof message === 'string' &&
    MISSING_CRAWL_DATA_MARKERS.some((marker) => message.includes(marker));
}

/**
 * Analyzes a site for SEO, content, and accessibility.
 * Supports live crawling or loading from a database snapshot.
 * Note: File-based data loading is not supported.
 *
 * Data source selection:
 * - `options.live` set: the URL is crawled live.
 * - otherwise the stored snapshot is loaded. When the requested URL is not in
 *   that snapshot, or no crawl data exists, the function switches to a live
 *   crawl and sets `options.live = true` on the object passed by the caller.
 *
 * @param url The root URL to analyze
 * @param options Analysis options (`live` may be set by this function, see above)
 * @param context Engine context for event emission
 * @returns The analysis result for the requested page, or for all pages when
 *          `options.allPages` is set
 * @throws Error when the URL cannot be normalized; errors from snapshot
 *         loading that are not "data missing" errors are rethrown unchanged
 */
export async function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult> {
  const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
  if (!normalizedRoot) {
    throw new Error('Invalid URL for analysis');
  }

  let crawlData: CrawlData;
  let robots: any = null;

  // Always try to fetch robots.txt for the analysis session
  // to ensure we have the latest rules for visibility reporting.
  try {
    const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
    const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
    const status = robotsRes.status;
    if (typeof status === 'number' && status >= 200 && status < 300) {
      const robotsParserModule = await import('robots-parser');
      const robotsParser = (robotsParserModule as any).default || robotsParserModule;
      robots = (robotsParser as any)(robotsUrl, robotsRes.body);
    }
  } catch {
    // Best effort: a failed robots.txt fetch must not abort the analysis.
  }

  if (options.live) {
    crawlData = await runLiveCrawl(normalizedRoot, options, context);
  } else {
    try {
      crawlData = await loadCrawlData(normalizedRoot);

      // The loader hands back a one-shot generator; materialize it so the
      // pages can be scanned here and analyzed again below.
      const allPages = Array.from(crawlData.pages);
      crawlData.pages = allPages;

      // Check if the requested URL actually exists in this snapshot
      const exists = allPages.some((p) => p.url === normalizedRoot);
      if (!exists) {
        options.live = true; // Lets the page selection below fall back to the first page
        context?.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
        crawlData = await runLiveCrawl(normalizedRoot, options, context);
      }
    } catch (error: unknown) {
      if (!isMissingCrawlDataError(error)) {
        throw error;
      }
      options.live = true; // Force live mode
      context?.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
      crawlData = await runLiveCrawl(normalizedRoot, options, context);
    }
  }

  const snapshotId = crawlData.snapshotId;
  const crawledAt = crawlData.crawledAt;

  // Clustering always runs; the result is read back from the graph in the return value.
  detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);

  const pages = analyzePages(normalizedRoot, crawlData.pages, robots);

  const activeModules = {
    seo: !!options.seo,
    content: !!options.content,
    accessibility: !!options.accessibility
  };

  const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;

  const filteredPages = hasFilters
    ? pages.map((page) => filterPageModules(page, activeModules))
    : pages;

  // Filter to only the requested URL
  const targetPage = filteredPages.find((p) => p.url === normalizedRoot);
  let resultPages: PageAnalysis[];

  if (options.allPages) {
    resultPages = filteredPages;
  } else {
    resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
  }

  // Summary counters are taken from the unfiltered analysis of every page.
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
  const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);

  return {
    site_summary: {
      pages_analyzed: resultPages.length,
      avg_seo_score: siteScores.seoHealthScore,
      thin_pages: thinPages,
      duplicate_titles: duplicateTitles,
      site_score: siteScores.overallScore
    },
    site_scores: siteScores,
    pages: resultPages,
    active_modules: activeModules,
    clusters: crawlData.graph.contentClusters,
    snapshotId,
    crawledAt
  };
}
|
|
212
|
-
|
|
213
|
-
/**
 * Renders an analysis result as HTML.
 *
 * A result holding exactly one page is rendered with the single-page
 * template; anything else becomes a summary with one table row per page.
 *
 * @param result Analysis result to render
 * @returns The filled-in HTML document
 */
export function renderAnalysisHtml(result: AnalysisResult): string {
  if (result.pages.length === 1) {
    return renderSinglePageHtml(result.pages[0]);
  }

  const rows = result.pages
    .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
    .join('');

  // Replacement values are supplied through functions: a plain replacement
  // string would have sequences such as `$&` or `$'` (a `$` is legal in URLs)
  // expanded by String.prototype.replace instead of inserted literally.
  return ANALYSIS_LIST_TEMPLATE
    .replace('{{PAGES_ANALYZED}}', () => result.site_summary.pages_analyzed.toString())
    .replace('{{AVG_SEO_SCORE}}', () => result.site_summary.avg_seo_score.toString())
    .replace('{{ROWS}}', () => rows);
}
|
|
226
|
-
|
|
227
|
-
/**
 * Fills the single-page report template with the values of one page analysis.
 *
 * Page-supplied text (URL, title, meta description, canonical, structured
 * data type names) is HTML-escaped before insertion.
 *
 * @param page Analysis of the page to render
 * @returns The filled-in HTML document
 */
function renderSinglePageHtml(page: PageAnalysis): string {
  // Wraps a value in a replacer function so that `$&`, `$'`, `$$` and similar
  // sequences inside page-supplied text are inserted literally instead of
  // being expanded by String.prototype.replace / replaceAll.
  const literal = (value: string) => (): string => value;

  const structuredDataStatus = page.structuredData.present
    ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
    : 'Not detected';

  // Type names originate from the page's own markup, so they are escaped too.
  const structuredDataTypesRow = page.structuredData.present ? `
<tr>
<th>Types Found</th>
<td>${page.structuredData.types.map(t => `<code>${escapeHtml(String(t))}</code>`).join(', ')}</td>
</tr>
` : '';

  // NOTE(review): tokens are substituted one after another, so a token-looking
  // string inside an earlier inserted value could be replaced by a later step.
  return ANALYSIS_PAGE_TEMPLATE
    .replaceAll('{{URL}}', literal(escapeHtml(page.url)))
    .replace('{{SEO_SCORE}}', literal(page.seoScore.toString()))
    .replace('{{THIN_SCORE}}', literal(page.thinScore.toString()))
    .replace('{{HTTP_STATUS}}', literal(page.status === 0 ? 'Pending/Limit' : page.status.toString()))
    .replace('{{TITLE_VALUE}}', literal(escapeHtml(page.title.value || '(missing)')))
    .replace('{{TITLE_LENGTH}}', literal(page.title.length.toString()))
    .replaceAll('{{TITLE_STATUS}}', literal(page.title.status))
    .replace('{{META_DESCRIPTION_VALUE}}', literal(escapeHtml(page.metaDescription.value || '(missing)')))
    .replace('{{META_DESCRIPTION_LENGTH}}', literal(page.metaDescription.length.toString()))
    .replaceAll('{{META_DESCRIPTION_STATUS}}', literal(page.metaDescription.status))
    .replace('{{CANONICAL}}', literal(page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'))
    .replace('{{ROBOTS_INDEX}}', literal((!page.meta.noindex).toString()))
    .replace('{{ROBOTS_FOLLOW}}', literal((!page.meta.nofollow).toString()))
    .replaceAll('{{H1_STATUS}}', literal(page.h1.status))
    .replace('{{H1_COUNT}}', literal(page.h1.count.toString()))
    .replace('{{H1_MATCHES_TITLE}}', literal(page.h1.matchesTitle ? ' | Matches Title' : ''))
    .replace('{{WORD_COUNT}}', literal(page.content.wordCount.toString()))
    .replace('{{UNIQUE_SENTENCES}}', literal(page.content.uniqueSentenceCount.toString()))
    .replace('{{TEXT_HTML_RATIO}}', literal((page.content.textHtmlRatio * 100).toFixed(2)))
    .replace('{{INTERNAL_LINKS}}', literal(page.links.internalLinks.toString()))
    .replace('{{EXTERNAL_LINKS}}', literal(page.links.externalLinks.toString()))
    .replace('{{EXTERNAL_RATIO}}', literal((page.links.externalRatio * 100).toFixed(1)))
    .replace('{{TOTAL_IMAGES}}', literal(page.images.totalImages.toString()))
    .replace('{{MISSING_ALT}}', literal(page.images.missingAlt.toString()))
    .replace('{{STRUCTURED_DATA_STATUS}}', literal(structuredDataStatus))
    .replace('{{STRUCTURED_DATA_TYPES_ROW}}', literal(structuredDataTypesRow));
}
|
|
267
|
-
|
|
268
|
-
/**
 * Renders an analysis result as a Markdown report: a summary list followed by
 * a table with one row per page.
 *
 * @param result Analysis result to render
 * @returns The Markdown document, lines joined with `\n`
 */
export function renderAnalysisMarkdown(result: AnalysisResult): string {
  const totals = result.site_summary;

  // A raw `|` inside a cell would be read as a column separator and break the
  // row, and `|` does occur in real URLs (e.g. in query strings).
  const escapeCell = (text: string): string => text.replace(/\|/g, '\\|');

  const lines = [
    '# Crawlith SEO Analysis Report',
    '',
    '## 📊 Summary',
    `- Pages Analyzed: ${totals.pages_analyzed}`,
    `- Overall Site Score: ${totals.site_score.toFixed(1)}`,
    `- Avg SEO Score: ${totals.avg_seo_score.toFixed(1)}`,
    `- Thin Pages Found: ${totals.thin_pages}`,
    `- Duplicate Titles: ${totals.duplicate_titles}`,
    '',
    '## 📄 Page Details',
    '',
    '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
    '| :--- | :--- | :--- | :--- | :--- |',
  ];

  for (const page of result.pages) {
    lines.push(`| ${escapeCell(page.url)} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
  }

  return lines.join('\n');
}
|
|
291
|
-
|
|
292
|
-
/**
 * Renders an analysis result as CSV: one header line plus one line per page.
 *
 * Title and meta description are always wrapped in double quotes with inner
 * quotes doubled. The URL is quoted the same way only when it contains a
 * comma, a double quote or a line break, so ordinary URLs are emitted
 * unchanged while unusual ones can no longer shift the columns.
 *
 * @param result Analysis result to render
 * @returns CSV text, lines joined with `\n`
 */
export function renderAnalysisCsv(result: AnalysisResult): string {
  const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];

  const quoted = (text: string): string => `"${text.replace(/"/g, '""')}"`;
  const quotedIfNeeded = (text: string): string => (/[",\r\n]/.test(text) ? quoted(text) : text);

  const rows = result.pages.map((p) => {
    // A status of 0 means no HTTP status was recorded for the page.
    const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
    return [
      quotedIfNeeded(p.url),
      p.seoScore,
      p.thinScore,
      statusStr,
      quoted(p.title.value || ''),
      p.title.length,
      quoted(p.metaDescription.value || ''),
      p.metaDescription.length,
      p.content.wordCount,
      p.links.internalLinks,
      p.links.externalLinks
    ].join(',');
  });

  return [headers.join(','), ...rows].join('\n');
}
|
|
313
|
-
|
|
314
|
-
/**
 * Escapes the characters that would otherwise be parsed as markup when the
 * value is inserted into HTML text: `&`, `<` and `>`.
 *
 * The ampersand is handled first so the entities produced for `<` and `>`
 * are not escaped a second time.
 *
 * NOTE(review): quotes are left untouched — confirm the templates never place
 * these values inside attribute values.
 *
 * @param value Raw text
 * @returns Text that is safe to insert as HTML element content
 */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}
|
|
317
|
-
|
|
318
|
-
/**
 * Runs every per-page analyzer over the given pages and scores the results.
 *
 * Works in two passes because duplicate detection needs site-wide counts:
 * pass 1 analyzes each page and tallies titles, meta descriptions and
 * unique-sentence counts; pass 2 flags duplicates and computes the thin
 * content and SEO scores.
 *
 * @param rootUrl Root URL of the analysis, forwarded to the link analyzer
 * @param pages Pages to analyze; consumed exactly once, so a generator works
 * @param robots Optional robots.txt matcher exposing `isAllowed(url, userAgent)`
 * @returns One analysis per input page, in input order
 */
export function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[] {
  const titleCounts = new Map<string, number>();
  const metaCounts = new Map<string, number>();
  const sentenceCountFrequency = new Map<number, number>();

  // Case- and whitespace-insensitive key used for duplicate detection.
  const duplicateKey = (text: string): string => text.trim().toLowerCase();
  const increment = <K>(counts: Map<K, number>, key: K): void => {
    counts.set(key, (counts.get(key) || 0) + 1);
  };

  // Only an explicit `false` counts as a disallow: robots-parser answers
  // `undefined` for URLs that do not belong to the robots.txt origin, and such
  // pages must not be reported as blocked.
  const isDisallowed = (target: string): boolean => robots.isAllowed(target, 'crawlith') === false;

  const results: PageAnalysis[] = [];

  for (const page of pages) {
    const html = page.html || '';

    // 0. Update crawl status based on current robots rules
    let crawlStatus = page.crawlStatus;
    if (robots) {
      // The URL is also tested with a trailing slash appended.
      const isBlocked = isDisallowed(page.url) ||
        (!page.url.endsWith('/') && isDisallowed(page.url + '/'));
      if (isBlocked) {
        crawlStatus = 'blocked_by_robots';
      }
    }

    // 1. Analyze Individual Components
    const title = analyzeTitle(html);
    const metaDescription = analyzeMetaDescription(html);
    const h1 = analyzeH1(html, title.value);
    const content = analyzeContent(html);
    const images = analyzeImageAlts(html);
    const links = analyzeLinks(html, page.url, rootUrl);
    const structuredData = analyzeStructuredData(html);

    // 2. Accumulate Frequencies for Duplicates
    if (title.value) {
      increment(titleCounts, duplicateKey(title.value));
    }
    if (metaDescription.value) {
      increment(metaCounts, duplicateKey(metaDescription.value));
    }
    increment(sentenceCountFrequency, content.uniqueSentenceCount);

    // 3. Store Preliminary Result
    results.push({
      url: page.url,
      status: page.status || 0,
      title,
      metaDescription,
      h1,
      content,
      thinScore: 0, // Calculated in pass 2
      images,
      links,
      structuredData,
      seoScore: 0, // Calculated in pass 2
      meta: {
        canonical: page.canonical,
        noindex: page.noindex,
        nofollow: page.nofollow,
        crawlStatus
      }
    });
  }

  // 4. Finalize Statuses and Scores (Pass 2)
  for (const analysis of results) {
    // Check Title Duplicates
    if (analysis.title.value && (titleCounts.get(duplicateKey(analysis.title.value)) || 0) > 1) {
      analysis.title.status = 'duplicate';
    }

    // Check Meta Duplicates
    if (analysis.metaDescription.value && (metaCounts.get(duplicateKey(analysis.metaDescription.value)) || 0) > 1) {
      analysis.metaDescription.status = 'duplicate';
    }

    // Check Content Duplication: sharing a unique-sentence count with any
    // other page is scored as full duplication (100), otherwise 0.
    const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
    analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);

    // Calculate Final SEO Score
    analysis.seoScore = scorePageSeo(analysis);
  }

  return results;
}
|
|
408
|
-
|
|
409
|
-
/**
 * Produces a copy of a page analysis in which the results of the modules that
 * were not requested are replaced by empty placeholder values.
 *
 * - SEO off: title, meta description, links and structured data are blanked.
 * - Content off: content metrics are blanked and the thin score becomes 0.
 * - Accessibility off: the image analysis is blanked.
 * - The H1 analysis is blanked only when both SEO and content are off.
 *
 * Every other field (including `seoScore`) is copied unchanged; the input
 * object is not modified.
 *
 * @param page Full analysis of a page
 * @param modules Which modules to keep
 * @returns The filtered copy
 */
function filterPageModules(
  page: PageAnalysis,
  modules: { seo: boolean; content: boolean; accessibility: boolean }
): PageAnalysis {
  const { seo, content, accessibility } = modules;
  const filtered: PageAnalysis = { ...page };

  if (!seo) {
    filtered.title = { value: null, length: 0, status: 'missing' };
    filtered.metaDescription = { value: null, length: 0, status: 'missing' };
    filtered.links = { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 };
    filtered.structuredData = { present: false, valid: false, types: [] };
  }

  if (!seo && !content) {
    filtered.h1 = { count: 0, status: 'critical', matchesTitle: false };
  }

  if (!content) {
    filtered.content = { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 };
    filtered.thinScore = 0;
  }

  if (!accessibility) {
    filtered.images = { totalImages: 0, missingAlt: 0, emptyAlt: 0 };
  }

  return filtered;
}
|
|
429
|
-
|
|
430
|
-
/**
 * Loads stored crawl data for a URL from the database.
 *
 * Snapshot choice: the snapshot in which the page was last seen is preferred;
 * if the page is unknown or that snapshot cannot be loaded, the site's latest
 * snapshot is used instead.
 *
 * @param rootUrl Normalized URL whose site data should be loaded
 * @returns Pages (as a lazy, one-shot generator), graph metrics, the graph,
 *          the snapshot id and the snapshot creation time
 * @throws Error when no snapshot is available for the site
 */
async function loadCrawlData(rootUrl: string): Promise<CrawlData> {
  const database = getDb();
  const sites = new SiteRepository(database);
  const snapshots = new SnapshotRepository(database);
  const pageStore = new PageRepository(database);

  // NOTE(review): `replace('www.', '')` strips the first "www." found anywhere
  // in the hostname, not only a leading prefix — confirm this matches how the
  // site domain is derived at crawl time before tightening it.
  const { hostname } = new URL(rootUrl);
  const site = sites.firstOrCreateSite(hostname.replace('www.', ''));

  const knownPage = pageStore.getPage(site.id, rootUrl);
  const lastSeenSnapshot = knownPage && knownPage.last_seen_snapshot_id
    ? snapshots.getSnapshot(knownPage.last_seen_snapshot_id)
    : undefined;
  const snapshot = lastSeenSnapshot || snapshots.getLatestSnapshot(site.id);

  if (!snapshot) {
    throw new Error(`No crawl data found for ${rootUrl} in database.`);
  }

  const graph = loadGraphFromSnapshot(snapshot.id);
  const metrics = calculateMetrics(graph, 5);

  // Rows are pulled through an iterator and converted one at a time so the
  // whole snapshot never has to be held in memory at once.
  const rows = pageStore.getPagesIteratorBySnapshot(snapshot.id);

  function* toCrawlPages() {
    for (const row of rows) {
      yield {
        url: row.normalized_url,
        status: row.http_status || 0,
        html: row.html || '',
        depth: row.depth || 0,
        canonical: row.canonical_url || undefined,
        noindex: !!row.noindex,
        nofollow: !!row.nofollow,
        // The crawl status is read from the graph node of the same URL.
        crawlStatus: graph.nodes.get(row.normalized_url)?.crawlStatus
      } as CrawlPage;
    }
  }

  return {
    pages: toCrawlPages(),
    metrics,
    graph,
    snapshotId: snapshot.id,
    crawledAt: snapshot.created_at
  };
}
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
/**
 * Crawls a single URL live and returns the resulting crawl data.
 *
 * The crawl is always restricted to one page (`limit: 1`, `depth: 0`) and is
 * recorded with `snapshotType: 'partial'`. Network-related settings are
 * forwarded from `options`.
 *
 * @param url URL to crawl.
 * @param options Analysis options; `rate`, `proxyUrl`, `userAgent`,
 *   `maxRedirects` and `debug` are passed through to the crawler.
 * @param context Optional engine context handed to the crawler unchanged.
 * @returns Pages built from the snapshot graph's nodes, metrics, the graph
 *   and the snapshot id.
 */
async function runLiveCrawl(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<CrawlData> {
  const { rate, proxyUrl, userAgent, maxRedirects, debug } = options;

  const snapshotId = (await crawl(
    url,
    {
      limit: 1, // Always limit to 1 for single page live analysis
      depth: 0,
      rate,
      proxyUrl,
      userAgent,
      maxRedirects,
      debug,
      snapshotType: 'partial'
    },
    context
  )) as number;

  const graph = loadGraphFromSnapshot(snapshotId);

  // Project every graph node onto the page shape, defaulting missing HTML to ''.
  const pages = graph.getNodes().map(({ url: nodeUrl, status, html, depth, crawlStatus }) => ({
    url: nodeUrl,
    status,
    html: html || '',
    depth,
    crawlStatus
  }));

  const metrics = calculateMetrics(graph, 1);

  return { pages, metrics, graph, snapshotId };
}
|
package/src/analysis/content.ts
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import { load } from 'cheerio';
|
|
2
|
-
|
|
3
|
-
/** Text statistics produced by `analyzeContent` for one HTML document. */
export interface ContentAnalysis {
  /** Number of whitespace-separated words in the extracted text. */
  wordCount: number;
  /** Length of the extracted text divided by the length of the raw HTML. */
  textHtmlRatio: number;
  /** Count of distinct sentences, compared case-insensitively after trimming. */
  uniqueSentenceCount: number;
}
|
|
8
|
-
|
|
9
|
-
/** Weights applied to the three components of the thin-content score. */
export interface ThinScoreWeights {
  /** Weight of the low-word-count component. */
  lowWordWeight: number;
  /** Weight of the low text-to-HTML-ratio component. */
  ratioWeight: number;
  /** Weight of the duplication component. */
  dupWeight: number;
}

/** Weights used by `calculateThinContentScore` when the caller supplies none. */
const DEFAULT_WEIGHTS: ThinScoreWeights = {
  lowWordWeight: 0.4,
  ratioWeight: 0.35,
  dupWeight: 0.25
};
|
|
20
|
-
|
|
21
|
-
/**
 * Extracts basic text statistics from an HTML document.
 *
 * `script`, `style`, `nav` and `footer` elements are removed before the text
 * is read, so they do not contribute to any of the returned figures. Text is
 * taken from `<body>` when present, otherwise from the whole document.
 *
 * @param html Raw HTML markup. An empty (or, from untyped callers, nullish)
 *   value is treated as an empty document.
 * @returns Word count, text-to-HTML length ratio and the number of distinct
 *   sentences.
 */
export function analyzeContent(html: string): ContentAnalysis {
  // Normalise once: the parser call already tolerated a falsy value, but the
  // length lookup further down would throw on null/undefined.
  const source = html ?? '';

  const $ = load(source || '<html></html>');
  $('script,style,nav,footer').remove();

  const text = $('body').length ? $('body').text() : $.text();
  // Collapse every whitespace run to a single space so lengths are comparable.
  const cleanText = text.replace(/\s+/g, ' ').trim();

  const words = cleanText ? cleanText.split(/\s+/).filter(Boolean) : [];
  const wordCount = words.length;

  // Clamp the denominator to 1 so an empty document yields a ratio of 0.
  const htmlLength = Math.max(source.length, 1);
  const textHtmlRatio = cleanText.length / htmlLength;

  // Sentences are split on runs of '.', '!' or '?' and compared
  // case-insensitively after trimming.
  const sentenceSet = new Set(
    cleanText
      .split(/[.!?]+/)
      .map((item) => item.trim().toLowerCase())
      .filter(Boolean)
  );

  return {
    wordCount,
    textHtmlRatio,
    uniqueSentenceCount: sentenceSet.size
  };
}
|
|
47
|
-
|
|
48
|
-
/**
 * Combines word count, text-to-HTML ratio and duplication into one
 * thin-content score.
 *
 * Word count and ratio are each converted to a penalty that is 0 at or above
 * the target (300 words, ratio 0.2) and grows linearly as the value falls
 * below it. The penalties and `duplicationScore` are then combined as a
 * weighted sum, rounded to two decimals and clamped to the range 0-100.
 *
 * @param content Text statistics for the page.
 * @param duplicationScore Duplication component, added with `weights.dupWeight`.
 * @param weights Component weights; defaults to `DEFAULT_WEIGHTS`.
 * @returns The thin-content score between 0 and 100.
 */
export function calculateThinContentScore(
  content: ContentAnalysis,
  duplicationScore: number,
  weights: ThinScoreWeights = DEFAULT_WEIGHTS
): number {
  // Penalty for falling short of a target: 0 once the target is reached.
  const shortfall = (value: number, target: number): number => {
    if (value >= target) {
      return 0;
    }
    return 100 - Math.min(100, (value / target) * 100);
  };

  const wordPenalty = shortfall(content.wordCount, 300);
  const ratioPenalty = shortfall(content.textHtmlRatio, 0.2);

  let weighted = weights.lowWordWeight * wordPenalty;
  weighted += weights.ratioWeight * ratioPenalty;
  weighted += weights.dupWeight * duplicationScore;

  const rounded = Number(weighted.toFixed(2));
  return Math.max(0, Math.min(100, rounded));
}
|
package/src/analysis/images.ts
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import { load } from 'cheerio';
|
|
2
|
-
|
|
3
|
-
/** Summary of `alt` attribute usage on the `<img>` elements of a document. */
export interface ImageAltAnalysis {
  /** Number of `<img>` elements found. */
  totalImages: number;
  /** Images that have no `alt` attribute at all. */
  missingAlt: number;
  /** Images whose `alt` attribute is present but empty or whitespace-only. */
  emptyAlt: number;
}
|
|
8
|
-
|
|
9
|
-
/**
 * Counts `<img>` elements and classifies their `alt` attributes.
 *
 * An image with no `alt` attribute counts as missing; an image whose `alt`
 * is empty or whitespace-only counts as empty.
 *
 * @param html HTML markup to inspect.
 * @returns Total image count plus the missing and empty `alt` counts.
 */
export function analyzeImageAlts(html: string): ImageAltAnalysis {
  const $ = load(html);
  const images = $('img');
  const tally = { missingAlt: 0, emptyAlt: 0 };

  images.each((_index, element) => {
    const altText = $(element).attr('alt');
    if (altText === undefined) {
      tally.missingAlt += 1;
    } else if (!altText.trim()) {
      tally.emptyAlt += 1;
    }
  });

  return {
    totalImages: images.length,
    missingAlt: tally.missingAlt,
    emptyAlt: tally.emptyAlt
  };
}
|
package/src/analysis/links.ts
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { load } from 'cheerio';
|
|
2
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
-
|
|
4
|
-
/** Link counts for one page, as produced by `analyzeLinks`. */
export interface LinkRatioAnalysis {
  /** Links whose origin equals the origin of the root URL. */
  internalLinks: number;
  /** Links whose origin differs from the origin of the root URL. */
  externalLinks: number;
  /** Links whose `rel` attribute contains "nofollow", internal or external. */
  nofollowCount: number;
  /** `externalLinks` divided by all counted links; 0 when there are none. */
  externalRatio: number;
}
|
|
10
|
-
|
|
11
|
-
/**
 * Classifies the anchors of a page as internal or external and counts
 * nofollow links.
 *
 * Each `a[href]` is normalised against `pageUrl` (query strings are kept).
 * Anchors with an empty href, or for which normalisation returns nothing,
 * are skipped. A link is internal when its origin equals the origin of
 * `rootUrl`.
 *
 * @param html HTML markup of the page.
 * @param pageUrl URL of the page, used as the base when normalising hrefs.
 * @param rootUrl URL whose origin defines what counts as internal.
 * @returns Internal, external and nofollow counts plus the external ratio.
 */
export function analyzeLinks(html: string, pageUrl: string, rootUrl: string): LinkRatioAnalysis {
  const $ = load(html);
  const siteOrigin = new URL(rootUrl).origin;
  const counts = { internal: 0, external: 0, nofollow: 0 };

  $('a[href]').each((_index, anchor) => {
    const link = $(anchor);

    const rawHref = link.attr('href');
    if (!rawHref) {
      return;
    }

    const resolved = normalizeUrl(rawHref, pageUrl, { stripQuery: false });
    if (!resolved) {
      return;
    }

    // nofollow is tallied independently of the internal/external split.
    const relValue = (link.attr('rel') || '').toLowerCase();
    if (relValue.includes('nofollow')) {
      counts.nofollow += 1;
    }

    const sameOrigin = new URL(resolved).origin === siteOrigin;
    if (sameOrigin) {
      counts.internal += 1;
    } else {
      counts.external += 1;
    }
  });

  const linkTotal = counts.internal + counts.external;
  let externalRatio = 0;
  if (linkTotal !== 0) {
    externalRatio = counts.external / linkTotal;
  }

  return {
    internalLinks: counts.internal,
    externalLinks: counts.external,
    nofollowCount: counts.nofollow,
    externalRatio
  };
}
|