@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/analysis/analyze.js
CHANGED
|
@@ -1,293 +1,342 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
2
|
import { crawl } from '../crawler/crawl.js';
|
|
3
|
+
import { UrlResolver } from '../crawler/resolver.js';
|
|
4
|
+
import { Fetcher } from '../crawler/fetcher.js';
|
|
3
5
|
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
4
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
6
|
+
import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
|
|
5
7
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
6
|
-
import { Graph } from '../graph/graph.js';
|
|
7
8
|
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
8
|
-
import { analyzeH1, analyzeMetaDescription, analyzeTitle
|
|
9
|
+
import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
|
|
9
10
|
import { analyzeImageAlts } from './images.js';
|
|
10
11
|
import { analyzeLinks } from './links.js';
|
|
11
12
|
import { analyzeStructuredData } from './structuredData.js';
|
|
12
13
|
import { aggregateSiteScore, scorePageSeo } from './scoring.js';
|
|
13
|
-
import {
|
|
14
|
+
import { ClusteringService } from './clustering.js';
|
|
15
|
+
import { DuplicateService } from './duplicate.js';
|
|
16
|
+
import { Soft404Service } from './soft404.js';
|
|
14
17
|
import { getDb } from '../db/index.js';
|
|
15
18
|
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
16
19
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
17
20
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
+
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
22
|
+
import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
|
|
23
|
+
import { DEFAULTS } from '../constants.js';
|
|
24
|
+
import { PageRankService } from '../graph/pagerank.js';
|
|
25
|
+
import { HITSService } from '../graph/hits.js';
|
|
26
|
+
import { HeadingHealthService } from './heading.js';
|
|
27
|
+
import { annotateOrphans } from './orphan.js';
|
|
28
|
+
import { HealthService } from '../scoring/health.js';
|
|
29
|
+
/**
|
|
30
|
+
* Analyzes a site for SEO, content, and accessibility.
|
|
31
|
+
* Supports live crawling or loading from a database snapshot.
|
|
32
|
+
*/
|
|
33
|
+
export async function analyzeSite(url, options, context) {
|
|
34
|
+
// 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
|
|
35
|
+
// We resolve the *origin* — not the full page URL — so rootOrigin is always just the
|
|
36
|
+
// scheme+host and normalizedPath is always the pathname.
|
|
37
|
+
let parsedUrl = null;
|
|
38
|
+
try {
|
|
39
|
+
parsedUrl = new URL(url);
|
|
40
|
+
}
|
|
41
|
+
catch { /* bare domain fallback below */ }
|
|
42
|
+
const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
|
|
43
|
+
const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
|
|
44
|
+
let rootOrigin = inputOrigin;
|
|
45
|
+
if (options.live !== false) {
|
|
46
|
+
const resolver = new UrlResolver();
|
|
47
|
+
const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
|
|
48
|
+
try {
|
|
49
|
+
const resolved = await resolver.resolve(inputOrigin, fetcher);
|
|
50
|
+
rootOrigin = resolved.url;
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
// Fallback to basic normalization if resolution fails
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
// Normalize origin and target URL independently.
|
|
57
|
+
const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
|
|
58
|
+
if (!normalizedOrigin) {
|
|
21
59
|
throw new Error('Invalid URL for analysis');
|
|
22
60
|
}
|
|
61
|
+
const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
|
|
62
|
+
const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
|
|
63
|
+
|| UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
|
|
64
|
+
const start = Date.now();
|
|
23
65
|
let crawlData;
|
|
66
|
+
let robots = null;
|
|
67
|
+
// 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
|
|
68
|
+
if (options.live) {
|
|
69
|
+
try {
|
|
70
|
+
const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
|
|
71
|
+
const { Fetcher } = await import('../crawler/fetcher.js');
|
|
72
|
+
const fetcher = new Fetcher({
|
|
73
|
+
rate: DEFAULTS.RATE_LIMIT,
|
|
74
|
+
proxyUrl: options.proxyUrl,
|
|
75
|
+
userAgent: options.userAgent
|
|
76
|
+
});
|
|
77
|
+
const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
|
|
78
|
+
if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
|
|
79
|
+
const robotsParserModule = await import('robots-parser');
|
|
80
|
+
const robotsParser = robotsParserModule.default || robotsParserModule;
|
|
81
|
+
robots = robotsParser(robotsUrl, robotsRes.body);
|
|
82
|
+
if (context)
|
|
83
|
+
context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
catch {
|
|
87
|
+
// Fallback
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
// Data Acquisition
|
|
24
91
|
if (options.live) {
|
|
25
|
-
|
|
92
|
+
const crawlStart = Date.now();
|
|
93
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
94
|
+
if (context)
|
|
95
|
+
context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
|
|
26
96
|
}
|
|
27
97
|
else {
|
|
28
98
|
try {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
if (
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
else {
|
|
41
|
-
throw error;
|
|
99
|
+
const loadStart = Date.now();
|
|
100
|
+
crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
|
|
101
|
+
if (context)
|
|
102
|
+
context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
|
|
103
|
+
const allPages = Array.from(crawlData.pages);
|
|
104
|
+
crawlData.pages = allPages;
|
|
105
|
+
const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
|
|
106
|
+
if (!exists) {
|
|
107
|
+
if (context)
|
|
108
|
+
context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
|
|
109
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
42
110
|
}
|
|
43
111
|
}
|
|
112
|
+
catch (_error) {
|
|
113
|
+
if (context)
|
|
114
|
+
context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
|
|
115
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
const snapshotId = crawlData.snapshotId;
|
|
119
|
+
const crawledAt = crawlData.crawledAt;
|
|
120
|
+
const pagesStart = Date.now();
|
|
121
|
+
const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
|
|
122
|
+
if (context)
|
|
123
|
+
context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
|
|
124
|
+
// Sync basic page analysis results back to graph nodes for persistence
|
|
125
|
+
for (const pageAnalysis of pages) {
|
|
126
|
+
const node = crawlData.graph.nodes.get(pageAnalysis.url);
|
|
127
|
+
if (node) {
|
|
128
|
+
node.soft404Score = pageAnalysis.soft404?.score;
|
|
129
|
+
node.wordCount = pageAnalysis.content.wordCount;
|
|
130
|
+
node.externalLinkRatio = pageAnalysis.links.externalRatio;
|
|
131
|
+
node.thinContentScore = pageAnalysis.thinScore;
|
|
132
|
+
node.title = pageAnalysis.title.value || undefined;
|
|
133
|
+
}
|
|
44
134
|
}
|
|
45
|
-
// Run clustering if requested or as default
|
|
46
|
-
detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
|
|
47
|
-
const pages = analyzePages(normalizedRoot, crawlData.pages);
|
|
48
135
|
const activeModules = {
|
|
49
136
|
seo: !!options.seo,
|
|
50
137
|
content: !!options.content,
|
|
51
138
|
accessibility: !!options.accessibility
|
|
52
139
|
};
|
|
53
140
|
const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
|
|
54
|
-
const filteredPages = hasFilters
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
141
|
+
const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
|
|
142
|
+
const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
|
|
143
|
+
let resultPages;
|
|
144
|
+
if (options.allPages) {
|
|
145
|
+
resultPages = filteredPages;
|
|
146
|
+
}
|
|
147
|
+
else {
|
|
148
|
+
resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
|
|
149
|
+
}
|
|
150
|
+
let clusters = [];
|
|
151
|
+
let duplicates = [];
|
|
152
|
+
let prResults = new Map();
|
|
153
|
+
let hitsResults = new Map();
|
|
154
|
+
let headingPayloads = {};
|
|
155
|
+
if (options.clustering) {
|
|
156
|
+
const clustering = new ClusteringService();
|
|
157
|
+
clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
|
|
158
|
+
}
|
|
159
|
+
if (options.allPages) {
|
|
160
|
+
const duplication = new DuplicateService();
|
|
161
|
+
duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
|
|
162
|
+
}
|
|
163
|
+
if (options.computePagerank) {
|
|
164
|
+
const prService = new PageRankService();
|
|
165
|
+
prResults = prService.evaluate(crawlData.graph);
|
|
166
|
+
}
|
|
167
|
+
if (options.computeHits) {
|
|
168
|
+
const hitsService = new HITSService();
|
|
169
|
+
hitsResults = hitsService.evaluate(crawlData.graph);
|
|
170
|
+
}
|
|
171
|
+
if (options.heading) {
|
|
172
|
+
const headingService = new HeadingHealthService();
|
|
173
|
+
const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
|
|
174
|
+
headingPayloads = payloadsByUrl;
|
|
175
|
+
}
|
|
176
|
+
if (options.orphans) {
|
|
177
|
+
const edges = crawlData.graph.getEdges();
|
|
178
|
+
annotateOrphans(crawlData.graph.getNodes(), edges, {
|
|
179
|
+
enabled: true,
|
|
180
|
+
severityEnabled: !!options.orphanSeverity,
|
|
181
|
+
includeSoftOrphans: !!options.includeSoftOrphans,
|
|
182
|
+
minInbound: options.minInbound || 2,
|
|
183
|
+
rootUrl: normalizedOrigin
|
|
184
|
+
});
|
|
185
|
+
}
|
|
186
|
+
// Run HealthService when --health is enabled
|
|
187
|
+
let healthBreakdown;
|
|
188
|
+
if (options.health) {
|
|
189
|
+
const healthService = new HealthService();
|
|
190
|
+
const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
|
|
191
|
+
healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
|
|
192
|
+
}
|
|
193
|
+
// Update nodes in graph with results
|
|
194
|
+
for (const node of crawlData.graph.getNodes()) {
|
|
195
|
+
const pr = prResults.get(node.url);
|
|
196
|
+
if (pr)
|
|
197
|
+
node.pagerankScore = pr.score;
|
|
198
|
+
const hits = hitsResults.get(node.url);
|
|
199
|
+
if (hits) {
|
|
200
|
+
node.hubScore = hits.hub_score;
|
|
201
|
+
node.authScore = hits.authority_score;
|
|
202
|
+
node.linkRole = hits.link_role;
|
|
203
|
+
}
|
|
204
|
+
const heading = headingPayloads[node.url];
|
|
205
|
+
if (heading) {
|
|
206
|
+
node.headingScore = heading.score;
|
|
207
|
+
node.headingData = JSON.stringify(heading);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
// Synchronize graph-level final scores back to PageAnalysis models
|
|
211
|
+
for (const page of pages) {
|
|
212
|
+
const node = crawlData.graph.nodes.get(page.url);
|
|
213
|
+
if (node) {
|
|
214
|
+
if (node.headingScore !== undefined)
|
|
215
|
+
page.headingScore = node.headingScore;
|
|
216
|
+
page.seoScore = scorePageSeo(page);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
60
219
|
const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
|
|
61
220
|
const thinPages = pages.filter((page) => page.thinScore >= 70).length;
|
|
62
|
-
const siteScores = aggregateSiteScore(crawlData.metrics, pages);
|
|
63
|
-
|
|
221
|
+
const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
|
|
222
|
+
if (context)
|
|
223
|
+
context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
|
|
224
|
+
// Persist to Database
|
|
225
|
+
const db = getDb();
|
|
226
|
+
const metricsRepo = new MetricsRepository(db);
|
|
227
|
+
const pageRepo = new PageRepository(db);
|
|
228
|
+
// Efficiently map URLs to IDs for this snapshot
|
|
229
|
+
const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
230
|
+
const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
|
|
231
|
+
const metricsToSave = crawlData.graph.getNodes().map(node => {
|
|
232
|
+
const pageId = urlToIdMap.get(node.url);
|
|
233
|
+
if (!pageId)
|
|
234
|
+
return null;
|
|
235
|
+
return {
|
|
236
|
+
snapshot_id: snapshotId,
|
|
237
|
+
page_id: pageId,
|
|
238
|
+
crawl_status: node.crawlStatus || null,
|
|
239
|
+
word_count: node.wordCount || null,
|
|
240
|
+
thin_content_score: node.thinContentScore || null,
|
|
241
|
+
external_link_ratio: node.externalLinkRatio || null,
|
|
242
|
+
pagerank_score: node.pagerankScore || null,
|
|
243
|
+
hub_score: node.hubScore || null,
|
|
244
|
+
auth_score: node.authScore || null,
|
|
245
|
+
link_role: node.linkRole || null,
|
|
246
|
+
duplicate_cluster_id: node.duplicateClusterId || null,
|
|
247
|
+
duplicate_type: node.duplicateType || null,
|
|
248
|
+
cluster_id: node.clusterId || null,
|
|
249
|
+
soft404_score: node.soft404Score || null,
|
|
250
|
+
heading_score: node.headingScore || null,
|
|
251
|
+
orphan_score: node.orphanScore || null,
|
|
252
|
+
orphan_type: node.orphanType || null,
|
|
253
|
+
impact_level: node.impactLevel || null,
|
|
254
|
+
heading_data: node.headingData || null,
|
|
255
|
+
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
256
|
+
};
|
|
257
|
+
}).filter(m => m !== null);
|
|
258
|
+
// Persist health score to snapshot if computed
|
|
259
|
+
if (healthBreakdown && snapshotId) {
|
|
260
|
+
const db2 = getDb();
|
|
261
|
+
const snapshotRepo = new SnapshotRepository(db2);
|
|
262
|
+
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
263
|
+
health_score: healthBreakdown.score
|
|
264
|
+
});
|
|
265
|
+
}
|
|
266
|
+
metricsRepo.insertMany(metricsToSave);
|
|
267
|
+
const result = {
|
|
64
268
|
site_summary: {
|
|
65
|
-
pages_analyzed:
|
|
269
|
+
pages_analyzed: resultPages.length,
|
|
66
270
|
avg_seo_score: siteScores.seoHealthScore,
|
|
67
271
|
thin_pages: thinPages,
|
|
68
272
|
duplicate_titles: duplicateTitles,
|
|
69
|
-
site_score: siteScores.overallScore
|
|
273
|
+
site_score: siteScores.overallScore,
|
|
274
|
+
site_score_breakdown: siteScores.breakdown
|
|
70
275
|
},
|
|
71
276
|
site_scores: siteScores,
|
|
72
277
|
pages: resultPages,
|
|
73
278
|
active_modules: activeModules,
|
|
74
|
-
|
|
279
|
+
snapshotId,
|
|
280
|
+
crawledAt,
|
|
281
|
+
clusters,
|
|
282
|
+
duplicates
|
|
75
283
|
};
|
|
284
|
+
return result;
|
|
76
285
|
}
|
|
77
|
-
export function
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
}
|
|
81
|
-
const rows = result.pages
|
|
82
|
-
.map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
|
|
83
|
-
.join('');
|
|
84
|
-
return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
|
|
85
|
-
}
|
|
86
|
-
function renderSinglePageHtml(page) {
|
|
87
|
-
return `<!DOCTYPE html>
|
|
88
|
-
<html lang="en">
|
|
89
|
-
<head>
|
|
90
|
-
<meta charset="UTF-8">
|
|
91
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
92
|
-
<title>Analysis for ${escapeHtml(page.url)}</title>
|
|
93
|
-
<style>
|
|
94
|
-
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
|
|
95
|
-
h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
|
|
96
|
-
h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
|
|
97
|
-
.score-card { display: flex; gap: 20px; margin-bottom: 30px; }
|
|
98
|
-
.score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
|
|
99
|
-
.score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
|
|
100
|
-
.status-ok { color: green; font-weight: bold; }
|
|
101
|
-
.status-warning { color: orange; font-weight: bold; }
|
|
102
|
-
.status-critical { color: red; font-weight: bold; }
|
|
103
|
-
.status-missing { color: red; font-weight: bold; }
|
|
104
|
-
.data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
|
|
105
|
-
.data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
|
|
106
|
-
.data-table th { width: 150px; color: #666; }
|
|
107
|
-
code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
|
|
108
|
-
</style>
|
|
109
|
-
</head>
|
|
110
|
-
<body>
|
|
111
|
-
<h1>Page Analysis</h1>
|
|
112
|
-
<p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
|
|
113
|
-
|
|
114
|
-
<div class="score-card">
|
|
115
|
-
<div class="score-box">
|
|
116
|
-
<div class="score-val">${page.seoScore}</div>
|
|
117
|
-
<div>SEO Score</div>
|
|
118
|
-
</div>
|
|
119
|
-
<div class="score-box">
|
|
120
|
-
<div class="score-val">${page.thinScore}</div>
|
|
121
|
-
<div>Thin Content Score</div>
|
|
122
|
-
</div>
|
|
123
|
-
<div class="score-box">
|
|
124
|
-
<div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
|
|
125
|
-
<div>HTTP Status</div>
|
|
126
|
-
</div>
|
|
127
|
-
</div>
|
|
128
|
-
|
|
129
|
-
<h2>Meta Tags</h2>
|
|
130
|
-
<table class="data-table">
|
|
131
|
-
<tr>
|
|
132
|
-
<th>Title</th>
|
|
133
|
-
<td>
|
|
134
|
-
<div>${escapeHtml(page.title.value || '(missing)')}</div>
|
|
135
|
-
<small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
|
|
136
|
-
</td>
|
|
137
|
-
</tr>
|
|
138
|
-
<tr>
|
|
139
|
-
<th>Description</th>
|
|
140
|
-
<td>
|
|
141
|
-
<div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
|
|
142
|
-
<small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
|
|
143
|
-
</td>
|
|
144
|
-
</tr>
|
|
145
|
-
<tr>
|
|
146
|
-
<th>Canonical</th>
|
|
147
|
-
<td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
|
|
148
|
-
</tr>
|
|
149
|
-
<tr>
|
|
150
|
-
<th>Robots</th>
|
|
151
|
-
<td>
|
|
152
|
-
Index: ${!page.meta.noindex},
|
|
153
|
-
Follow: ${!page.meta.nofollow}
|
|
154
|
-
</td>
|
|
155
|
-
</tr>
|
|
156
|
-
</table>
|
|
157
|
-
|
|
158
|
-
<h2>Content & Heading</h2>
|
|
159
|
-
<table class="data-table">
|
|
160
|
-
<tr>
|
|
161
|
-
<th>H1 Tag</th>
|
|
162
|
-
<td>
|
|
163
|
-
Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
|
|
164
|
-
(${page.h1.count} detected)
|
|
165
|
-
${page.h1.matchesTitle ? ' | Matches Title' : ''}
|
|
166
|
-
</td>
|
|
167
|
-
</tr>
|
|
168
|
-
<tr>
|
|
169
|
-
<th>Word Count</th>
|
|
170
|
-
<td>${page.content.wordCount} words</td>
|
|
171
|
-
</tr>
|
|
172
|
-
<tr>
|
|
173
|
-
<th>Unique Sentences</th>
|
|
174
|
-
<td>${page.content.uniqueSentenceCount}</td>
|
|
175
|
-
</tr>
|
|
176
|
-
<tr>
|
|
177
|
-
<th>Text / HTML Ratio</th>
|
|
178
|
-
<td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
|
|
179
|
-
</tr>
|
|
180
|
-
</table>
|
|
181
|
-
|
|
182
|
-
<h2>Links & Images</h2>
|
|
183
|
-
<table class="data-table">
|
|
184
|
-
<tr>
|
|
185
|
-
<th>Internal Links</th>
|
|
186
|
-
<td>${page.links.internalLinks}</td>
|
|
187
|
-
</tr>
|
|
188
|
-
<tr>
|
|
189
|
-
<th>External Links</th>
|
|
190
|
-
<td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
|
|
191
|
-
</tr>
|
|
192
|
-
<tr>
|
|
193
|
-
<th>Images</th>
|
|
194
|
-
<td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
|
|
195
|
-
</tr>
|
|
196
|
-
</table>
|
|
197
|
-
|
|
198
|
-
<h2>Structured Data</h2>
|
|
199
|
-
<table class="data-table">
|
|
200
|
-
<tr>
|
|
201
|
-
<th>Status</th>
|
|
202
|
-
<td>
|
|
203
|
-
${page.structuredData.present
|
|
204
|
-
? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
|
|
205
|
-
: 'Not detected'}
|
|
206
|
-
</td>
|
|
207
|
-
</tr>
|
|
208
|
-
${page.structuredData.present ? `
|
|
209
|
-
<tr>
|
|
210
|
-
<th>Types Found</th>
|
|
211
|
-
<td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
|
|
212
|
-
</tr>
|
|
213
|
-
` : ''}
|
|
214
|
-
</table>
|
|
215
|
-
</body>
|
|
216
|
-
</html>`;
|
|
217
|
-
}
|
|
218
|
-
/**
 * Render an analysis result as a Markdown report.
 *
 * The report has a summary section built from `result.site_summary` followed
 * by a table with one row per entry in `result.pages`.
 *
 * @param {object} result - Analysis result; reads `site_summary.pages_analyzed`,
 *   `site_summary.site_score`, `site_summary.avg_seo_score`,
 *   `site_summary.thin_pages`, `site_summary.duplicate_titles` and `pages`.
 * @returns {string} The Markdown document, lines joined with `\n`.
 */
export function renderAnalysisMarkdown(result) {
    // A raw '|' inside a cell is read as a column separator and shifts every
    // following column, so escape it in crawl-derived values.
    const cell = (text) => String(text).replaceAll('|', '\\|');
    const { site_summary: siteSummary, pages } = result;
    const summary = [
        '# Crawlith SEO Analysis Report',
        '',
        '## 📊 Summary',
        `- Pages Analyzed: ${siteSummary.pages_analyzed}`,
        `- Overall Site Score: ${siteSummary.site_score.toFixed(1)}`,
        `- Avg SEO Score: ${siteSummary.avg_seo_score.toFixed(1)}`,
        `- Thin Pages Found: ${siteSummary.thin_pages}`,
        `- Duplicate Titles: ${siteSummary.duplicate_titles}`,
        '',
        '## 📄 Page Details',
        '',
        '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
        '| :--- | :--- | :--- | :--- | :--- |',
    ];
    for (const page of pages) {
        summary.push(`| ${cell(page.url)} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
    }
    return summary.join('\n');
}
|
|
239
|
-
/**
 * Render an analysis result as CSV text: one header line followed by one
 * line per entry in `result.pages`, joined with `\n`.
 *
 * Title and meta description are always wrapped in double quotes (embedded
 * quotes doubled). The URL is quoted only when it contains a character that
 * would otherwise corrupt the row, so ordinary URLs are emitted unchanged.
 *
 * @param {object} result - Analysis result; reads `pages`.
 * @returns {string} The CSV document.
 */
export function renderAnalysisCsv(result) {
    const quoted = (text) => `"${String(text).replaceAll('"', '""')}"`;
    // Quote only when the value contains a delimiter, a quote or a line break.
    const quotedIfNeeded = (value) => {
        const text = value == null ? '' : String(value);
        return /[",\r\n]/.test(text) ? quoted(text) : text;
    };
    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
    const rows = result.pages.map((p) => {
        // A status of 0 is reported as a label instead of a number.
        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
        return [
            quotedIfNeeded(p.url),
            p.seoScore,
            p.thinScore,
            statusStr,
            quoted(p.title.value || ''),
            p.title.length,
            quoted(p.metaDescription.value || ''),
            p.metaDescription.length,
            p.content.wordCount,
            p.links.internalLinks,
            p.links.externalLinks,
        ].join(',');
    });
    return [headers.join(','), ...rows].join('\n');
}
|
|
259
|
-
/**
 * Escape the characters that are significant in HTML markup.
 *
 * NOTE: quote characters are left untouched, so the result is not suitable
 * for interpolation into an attribute value.
 *
 * @param {*} value - Text to escape; `null`/`undefined` yield an empty
 *   string, other non-strings are converted with `String()`.
 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
 */
function escapeHtml(value) {
    const text = value == null ? '' : String(value);
    // '&' goes first so the entities produced for '<' and '>' are not re-escaped.
    return text.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
}
|
|
262
|
-
function analyzePages(rootUrl, pages) {
|
|
263
|
-
const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
|
|
264
|
-
const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
|
|
265
|
-
const titles = applyDuplicateStatuses(titleCandidates);
|
|
266
|
-
const metas = applyDuplicateStatuses(metaCandidates);
|
|
286
|
+
export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
|
|
287
|
+
const titleCounts = new Map();
|
|
288
|
+
const metaCounts = new Map();
|
|
267
289
|
const sentenceCountFrequency = new Map();
|
|
268
|
-
const
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
290
|
+
const results = [];
|
|
291
|
+
const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
|
|
292
|
+
const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);
|
|
293
|
+
for (const page of pages) {
|
|
294
|
+
const pagePath = UrlUtil.toPath(page.url, rootOrigin);
|
|
295
|
+
const pageAbs = UrlUtil.toAbsolute(page.url, rootOrigin);
|
|
296
|
+
const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbs === targetAbs;
|
|
297
|
+
// In single-page mode, if it's not the target, we skip it entirely for speed.
|
|
298
|
+
if (!options.allPages && !isTarget)
|
|
299
|
+
continue;
|
|
273
300
|
const html = page.html || '';
|
|
274
|
-
const
|
|
275
|
-
|
|
276
|
-
const
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
301
|
+
const $ = load(html || '<html></html>');
|
|
302
|
+
// Reconstruct absolute URL from stored path for robots & link resolution
|
|
303
|
+
const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
|
|
304
|
+
let crawlStatus = page.crawlStatus;
|
|
305
|
+
if (robots) {
|
|
306
|
+
const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
|
|
307
|
+
(!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
|
|
308
|
+
if (isBlocked)
|
|
309
|
+
crawlStatus = 'blocked_by_robots';
|
|
310
|
+
}
|
|
311
|
+
// Shared DOM Analysis
|
|
312
|
+
const title = analyzeTitle($);
|
|
313
|
+
const metaDescription = analyzeMetaDescription($);
|
|
314
|
+
const h1 = analyzeH1($, title.value);
|
|
315
|
+
const content = analyzeContent($);
|
|
316
|
+
const images = analyzeImageAlts($);
|
|
317
|
+
const links = analyzeLinks($, pageAbsUrl, rootOrigin);
|
|
318
|
+
const structuredData = analyzeStructuredData($);
|
|
319
|
+
if (title.value) {
|
|
320
|
+
const key = title.value.trim().toLowerCase();
|
|
321
|
+
titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
|
|
322
|
+
}
|
|
323
|
+
if (metaDescription.value) {
|
|
324
|
+
const key = metaDescription.value.trim().toLowerCase();
|
|
325
|
+
metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
|
|
326
|
+
}
|
|
327
|
+
sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
|
|
328
|
+
const soft404Service = new Soft404Service();
|
|
329
|
+
const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);
|
|
330
|
+
const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
|
|
331
|
+
page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));
|
|
332
|
+
const resultPage = {
|
|
284
333
|
url: page.url,
|
|
285
334
|
status: page.status || 0,
|
|
286
335
|
title,
|
|
287
336
|
metaDescription,
|
|
288
337
|
h1,
|
|
289
338
|
content,
|
|
290
|
-
thinScore,
|
|
339
|
+
thinScore: 0,
|
|
291
340
|
images,
|
|
292
341
|
links,
|
|
293
342
|
structuredData,
|
|
@@ -295,45 +344,50 @@ function analyzePages(rootUrl, pages) {
|
|
|
295
344
|
meta: {
|
|
296
345
|
canonical: page.canonical,
|
|
297
346
|
noindex: page.noindex,
|
|
298
|
-
nofollow: page.nofollow
|
|
299
|
-
|
|
347
|
+
nofollow: page.nofollow,
|
|
348
|
+
crawlStatus,
|
|
349
|
+
canonicalConflict: isCanonicalConflict
|
|
350
|
+
},
|
|
351
|
+
soft404
|
|
300
352
|
};
|
|
353
|
+
Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
|
|
354
|
+
results.push(resultPage);
|
|
355
|
+
}
|
|
356
|
+
for (const analysis of results) {
|
|
357
|
+
if (analysis.title.value) {
|
|
358
|
+
const key = analysis.title.value.trim().toLowerCase();
|
|
359
|
+
if ((titleCounts.get(key) || 0) > 1)
|
|
360
|
+
analysis.title.status = 'duplicate';
|
|
361
|
+
}
|
|
362
|
+
if (analysis.metaDescription.value) {
|
|
363
|
+
const key = analysis.metaDescription.value.trim().toLowerCase();
|
|
364
|
+
if ((metaCounts.get(key) || 0) > 1)
|
|
365
|
+
analysis.metaDescription.status = 'duplicate';
|
|
366
|
+
}
|
|
367
|
+
const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
|
|
368
|
+
analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
|
|
301
369
|
analysis.seoScore = scorePageSeo(analysis);
|
|
302
|
-
|
|
303
|
-
|
|
370
|
+
}
|
|
371
|
+
return results;
|
|
304
372
|
}
|
|
305
373
|
function filterPageModules(page, modules) {
|
|
306
|
-
const
|
|
307
|
-
const keepContent = modules.content;
|
|
308
|
-
const keepAccessibility = modules.accessibility;
|
|
309
|
-
return {
|
|
374
|
+
const filtered = {
|
|
310
375
|
...page,
|
|
311
|
-
title:
|
|
312
|
-
metaDescription:
|
|
313
|
-
h1: (
|
|
314
|
-
links:
|
|
315
|
-
structuredData:
|
|
316
|
-
content:
|
|
317
|
-
thinScore:
|
|
318
|
-
images:
|
|
376
|
+
title: modules.seo ? page.title : { value: null, length: 0, status: 'missing' },
|
|
377
|
+
metaDescription: modules.seo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
|
|
378
|
+
h1: (modules.seo || modules.content) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
|
|
379
|
+
links: modules.seo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
|
|
380
|
+
structuredData: modules.seo ? page.structuredData : { present: false, valid: false, types: [] },
|
|
381
|
+
content: modules.content ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
|
|
382
|
+
thinScore: modules.content ? page.thinScore : 0,
|
|
383
|
+
images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
|
|
319
384
|
};
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
// If fromCrawl is provided, we could theoretically load JSON, but
|
|
323
|
-
// we now default to DB fetching for all operations.
|
|
324
|
-
if (fromCrawl) {
|
|
325
|
-
try {
|
|
326
|
-
const content = await fs.readFile(fromCrawl, 'utf-8');
|
|
327
|
-
const raw = JSON.parse(content);
|
|
328
|
-
const pages = parsePages(raw);
|
|
329
|
-
const graph = graphFromPages(rootUrl, pages, raw);
|
|
330
|
-
const metrics = calculateMetrics(graph, 5);
|
|
331
|
-
return { pages, metrics, graph };
|
|
332
|
-
}
|
|
333
|
-
catch (_e) {
|
|
334
|
-
// Fallback downwards if file doesn't exist
|
|
335
|
-
}
|
|
385
|
+
if (page.html) {
|
|
386
|
+
Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
|
|
336
387
|
}
|
|
388
|
+
return filtered;
|
|
389
|
+
}
|
|
390
|
+
async function loadCrawlData(rootUrl, snapshotId) {
|
|
337
391
|
const db = getDb();
|
|
338
392
|
const siteRepo = new SiteRepository(db);
|
|
339
393
|
const snapshotRepo = new SnapshotRepository(db);
|
|
@@ -341,77 +395,43 @@ async function loadCrawlData(rootUrl, fromCrawl) {
|
|
|
341
395
|
const urlObj = new URL(rootUrl);
|
|
342
396
|
const domain = urlObj.hostname.replace('www.', '');
|
|
343
397
|
const site = siteRepo.firstOrCreateSite(domain);
|
|
344
|
-
|
|
398
|
+
let snapshot = null;
|
|
399
|
+
if (snapshotId) {
|
|
400
|
+
snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
401
|
+
}
|
|
345
402
|
if (!snapshot) {
|
|
346
|
-
|
|
403
|
+
for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
|
|
404
|
+
const page = pageRepo.getPage(site.id, candidate);
|
|
405
|
+
if (page?.last_seen_snapshot_id) {
|
|
406
|
+
snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
|
|
407
|
+
break;
|
|
408
|
+
}
|
|
409
|
+
}
|
|
347
410
|
}
|
|
411
|
+
if (!snapshot)
|
|
412
|
+
snapshot = snapshotRepo.getLatestSnapshot(site.id);
|
|
413
|
+
if (!snapshot)
|
|
414
|
+
throw new Error(`No crawl data found for ${rootUrl}`);
|
|
348
415
|
const graph = loadGraphFromSnapshot(snapshot.id);
|
|
349
416
|
const metrics = calculateMetrics(graph, 5);
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
return { pages, metrics, graph };
|
|
363
|
-
}
|
|
364
|
-
function parsePages(raw) {
|
|
365
|
-
if (Array.isArray(raw.pages)) {
|
|
366
|
-
return raw.pages.map((page) => {
|
|
367
|
-
const p = page;
|
|
368
|
-
return {
|
|
369
|
-
url: String(p.url || ''),
|
|
370
|
-
status: Number(p.status || 0),
|
|
371
|
-
html: typeof p.html === 'string' ? p.html : '',
|
|
372
|
-
depth: Number(p.depth || 0)
|
|
373
|
-
};
|
|
374
|
-
}).filter((page) => Boolean(page.url));
|
|
375
|
-
}
|
|
376
|
-
if (Array.isArray(raw.nodes)) {
|
|
377
|
-
return raw.nodes.map((node) => {
|
|
378
|
-
const n = node;
|
|
379
|
-
return {
|
|
380
|
-
url: String(n.url || ''),
|
|
381
|
-
status: Number(n.status || 0),
|
|
382
|
-
html: typeof n.html === 'string' ? n.html : '',
|
|
383
|
-
depth: Number(n.depth || 0)
|
|
417
|
+
const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
|
|
418
|
+
const pagesGenerator = function* () {
|
|
419
|
+
for (const p of dbPagesIterator) {
|
|
420
|
+
yield {
|
|
421
|
+
url: p.normalized_url,
|
|
422
|
+
status: p.http_status || 0,
|
|
423
|
+
html: p.html || '',
|
|
424
|
+
depth: p.depth || 0,
|
|
425
|
+
canonical: p.canonical_url || undefined,
|
|
426
|
+
noindex: !!p.noindex,
|
|
427
|
+
nofollow: !!p.nofollow,
|
|
428
|
+
crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
|
|
384
429
|
};
|
|
385
|
-
}).filter((page) => Boolean(page.url));
|
|
386
|
-
}
|
|
387
|
-
return [];
|
|
388
|
-
}
|
|
389
|
-
function graphFromPages(rootUrl, pages, raw) {
|
|
390
|
-
const graph = new Graph();
|
|
391
|
-
for (const page of pages) {
|
|
392
|
-
graph.addNode(page.url, page.depth || 0, page.status || 0);
|
|
393
|
-
}
|
|
394
|
-
if (Array.isArray(raw.edges)) {
|
|
395
|
-
for (const edge of raw.edges) {
|
|
396
|
-
const e = edge;
|
|
397
|
-
if (typeof e.source === 'string' && typeof e.target === 'string') {
|
|
398
|
-
graph.addNode(e.source, 0, 0);
|
|
399
|
-
graph.addNode(e.target, 0, 0);
|
|
400
|
-
graph.addEdge(e.source, e.target);
|
|
401
|
-
}
|
|
402
430
|
}
|
|
403
|
-
|
|
404
|
-
}
|
|
405
|
-
for (const page of pages) {
|
|
406
|
-
if (!page.html)
|
|
407
|
-
continue;
|
|
408
|
-
const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
|
|
409
|
-
if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
|
|
410
|
-
continue;
|
|
411
|
-
}
|
|
412
|
-
return graph;
|
|
431
|
+
};
|
|
432
|
+
return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
|
|
413
433
|
}
|
|
414
|
-
async function runLiveCrawl(url, options) {
|
|
434
|
+
async function runLiveCrawl(url, origin, options, context, robots) {
|
|
415
435
|
const snapshotId = await crawl(url, {
|
|
416
436
|
limit: 1,
|
|
417
437
|
depth: 0,
|
|
@@ -419,18 +439,46 @@ async function runLiveCrawl(url, options) {
|
|
|
419
439
|
proxyUrl: options.proxyUrl,
|
|
420
440
|
userAgent: options.userAgent,
|
|
421
441
|
maxRedirects: options.maxRedirects,
|
|
422
|
-
debug: options.debug
|
|
423
|
-
|
|
442
|
+
debug: options.debug,
|
|
443
|
+
snapshotRunType: 'single',
|
|
444
|
+
robots,
|
|
445
|
+
sitemap: options.sitemap,
|
|
446
|
+
plugins: options.plugins
|
|
447
|
+
}, context);
|
|
424
448
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
425
449
|
const pages = graph.getNodes().map((node) => ({
|
|
426
450
|
url: node.url,
|
|
427
451
|
status: node.status,
|
|
428
|
-
html: node.html || '',
|
|
429
|
-
depth: node.depth
|
|
452
|
+
html: node.html || '',
|
|
453
|
+
depth: node.depth,
|
|
454
|
+
crawlStatus: node.crawlStatus
|
|
430
455
|
}));
|
|
431
|
-
return {
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
456
|
+
return { pages, metrics: calculateMetrics(graph, 1), graph, snapshotId };
|
|
457
|
+
}
|
|
458
|
+
/**
 * Escape the characters that are significant in HTML markup.
 *
 * NOTE: quote characters are left untouched, so the result is not suitable
 * for interpolation into an attribute value.
 *
 * @param {*} value - Text to escape; `null`/`undefined` yield an empty
 *   string, other non-strings are converted with `String()`.
 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
 */
export function escapeHtml(value) {
    const text = value == null ? '' : String(value);
    // '&' goes first so the entities produced for '<' and '>' are not re-escaped.
    return text.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
}
|
|
461
|
+
/**
 * Render an analysis result as HTML.
 *
 * A result with exactly one page is delegated to `renderSinglePageHtml`;
 * otherwise one table row per page is substituted into
 * `ANALYSIS_LIST_TEMPLATE` together with two summary figures.
 *
 * @param {object} result - Analysis result; reads `pages`,
 *   `site_summary.pages_analyzed` and `site_summary.avg_seo_score`.
 * @returns {string} The rendered HTML document.
 */
export function renderAnalysisHtml(result) {
    if (result.pages.length === 1) {
        return renderSinglePageHtml(result.pages[0]);
    }
    const rows = result.pages
        .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
        .join('');
    const pagesAnalyzed = result.site_summary.pages_analyzed.toString();
    const avgSeoScore = result.site_summary.avg_seo_score.toString();
    // Replacer functions are used instead of replacement strings: a string
    // replacement would interpret sequences such as "$&" or "$$" that can
    // occur in crawled URLs, corrupting the output.
    return ANALYSIS_LIST_TEMPLATE
        .replace('{{PAGES_ANALYZED}}', () => pagesAnalyzed)
        .replace('{{AVG_SEO_SCORE}}', () => avgSeoScore)
        .replace('{{ROWS}}', () => rows);
}
|
|
467
|
+
/**
 * Render the detail report for one analyzed page by substituting its values
 * into `ANALYSIS_PAGE_TEMPLATE`.
 *
 * Substitutions are applied one after another in the order listed below.
 * Placeholders flagged `true` are replaced at every occurrence, the others
 * only at their first occurrence.
 *
 * @param {object} page - Analyzed page; reads `url`, `seoScore`, `thinScore`,
 *   `status`, `title`, `metaDescription`, `meta`, `h1`, `content`, `links`,
 *   `images` and `structuredData`.
 * @returns {string} The rendered HTML document.
 */
function renderSinglePageHtml(page) {
    const { structuredData } = page;
    let structuredDataStatus = 'Not detected';
    if (structuredData.present) {
        structuredDataStatus = structuredData.valid
            ? '<span class="status-ok">Valid</span>'
            : '<span class="status-critical">Invalid JSON</span>';
    }
    // Type names are taken from the analyzed page, so escape them before they
    // are placed inside markup.
    const structuredDataTypesRow = structuredData.present
        ? `<tr><th>Types Found</th><td>${structuredData.types.map((t) => `<code>${escapeHtml(String(t))}</code>`).join(', ')}</td></tr>`
        : '';
    // [placeholder, replacement text, replace every occurrence?]
    const substitutions = [
        ['{{URL}}', escapeHtml(page.url), true],
        ['{{SEO_SCORE}}', page.seoScore.toString(), false],
        ['{{THIN_SCORE}}', page.thinScore.toString(), false],
        ['{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString(), false],
        ['{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'), false],
        ['{{TITLE_LENGTH}}', page.title.length.toString(), false],
        ['{{TITLE_STATUS}}', page.title.status, true],
        ['{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'), false],
        ['{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString(), false],
        ['{{META_DESCRIPTION_STATUS}}', page.metaDescription.status, true],
        ['{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>', false],
        ['{{ROBOTS_INDEX}}', (!page.meta.noindex).toString(), false],
        ['{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString(), false],
        ['{{H1_STATUS}}', page.h1.status, true],
        ['{{H1_COUNT}}', page.h1.count.toString(), false],
        ['{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '', false],
        ['{{WORD_COUNT}}', page.content.wordCount.toString(), false],
        ['{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString(), false],
        ['{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2), false],
        ['{{INTERNAL_LINKS}}', page.links.internalLinks.toString(), false],
        ['{{EXTERNAL_LINKS}}', page.links.externalLinks.toString(), false],
        ['{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1), false],
        ['{{TOTAL_IMAGES}}', page.images.totalImages.toString(), false],
        ['{{MISSING_ALT}}', page.images.missingAlt.toString(), false],
        ['{{STRUCTURED_DATA_STATUS}}', structuredDataStatus, false],
        ['{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow, false],
    ];
    // Replacer functions keep "$&", "$$" and similar sequences in page-derived
    // text from being interpreted as replacement patterns.
    return substitutions.reduce(
        (html, [placeholder, text, everyOccurrence]) => (everyOccurrence
            ? html.replaceAll(placeholder, () => text)
            : html.replace(placeholder, () => text)),
        ANALYSIS_PAGE_TEMPLATE
    );
}
|
|
472
|
+
/**
 * Render an analysis result as a Markdown report.
 *
 * The report has a summary section built from `result.site_summary` followed
 * by a table with one row per entry in `result.pages`. A page without a
 * canonical URL shows `-` in the Canonical column.
 *
 * @param {object} result - Analysis result; reads `site_summary.pages_analyzed`,
 *   `site_summary.site_score`, `site_summary.avg_seo_score`,
 *   `site_summary.thin_pages`, `site_summary.duplicate_titles` and `pages`.
 * @returns {string} The Markdown document, lines joined with `\n`.
 */
export function renderAnalysisMarkdown(result) {
    // A raw '|' inside a cell is read as a column separator and shifts every
    // following column, so escape it in crawl-derived values.
    const cell = (text) => String(text).replaceAll('|', '\\|');
    const { site_summary: siteSummary, pages } = result;
    const summary = [
        '# Crawlith SEO Analysis Report',
        '',
        '## 📊 Summary',
        `- Pages Analyzed: ${siteSummary.pages_analyzed}`,
        `- Overall Site Score: ${siteSummary.site_score.toFixed(1)}`,
        `- Avg SEO Score: ${siteSummary.avg_seo_score.toFixed(1)}`,
        `- Thin Pages Found: ${siteSummary.thin_pages}`,
        `- Duplicate Titles: ${siteSummary.duplicate_titles}`,
        '',
        '## 📄 Page Details',
        '',
        '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |',
        '| :--- | :--- | :--- | :--- | :--- | :--- |',
    ];
    for (const page of pages) {
        summary.push(`| ${cell(page.url)} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${cell(page.meta.canonical || '-')} |`);
    }
    return summary.join('\n');
}
|
|
477
|
+
/**
 * Render an analysis result as CSV text: one header line followed by one
 * line per entry in `result.pages`, joined with `\n`.
 *
 * Title and meta description are always wrapped in double quotes (embedded
 * quotes doubled). URL and canonical are quoted only when they contain a
 * character that would otherwise corrupt the row, so ordinary URLs are
 * emitted unchanged.
 *
 * @param {object} result - Analysis result; reads `pages`.
 * @returns {string} The CSV document.
 */
export function renderAnalysisCsv(result) {
    const quoted = (text) => `"${String(text).replaceAll('"', '""')}"`;
    // Quote only when the value contains a delimiter, a quote or a line break.
    const quotedIfNeeded = (value) => {
        const text = value == null ? '' : String(value);
        return /[",\r\n]/.test(text) ? quoted(text) : text;
    };
    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
    const rows = result.pages.map((p) => {
        // A status of 0 is reported as a label instead of a number.
        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
        return [
            quotedIfNeeded(p.url),
            p.seoScore,
            p.thinScore,
            statusStr,
            quoted(p.title.value || ''),
            p.title.length,
            quoted(p.metaDescription.value || ''),
            p.metaDescription.length,
            p.content.wordCount,
            p.links.internalLinks,
            p.links.externalLinks,
            quotedIfNeeded(p.meta.canonical || ''),
        ].join(',');
    });
    return [headers.join(','), ...rows].join('\n');
}
|