@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/analysis/analyze.js
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
1
2
|
import { crawl } from '../crawler/crawl.js';
|
|
3
|
+
import { UrlResolver } from '../crawler/resolver.js';
|
|
4
|
+
import { Fetcher } from '../crawler/fetcher.js';
|
|
2
5
|
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
3
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
6
|
+
import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
|
|
4
7
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
5
8
|
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
6
9
|
import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
|
|
@@ -8,95 +11,135 @@ import { analyzeImageAlts } from './images.js';
|
|
|
8
11
|
import { analyzeLinks } from './links.js';
|
|
9
12
|
import { analyzeStructuredData } from './structuredData.js';
|
|
10
13
|
import { aggregateSiteScore, scorePageSeo } from './scoring.js';
|
|
11
|
-
import {
|
|
14
|
+
import { ClusteringService } from './clustering.js';
|
|
15
|
+
import { DuplicateService } from './duplicate.js';
|
|
16
|
+
import { Soft404Service } from './soft404.js';
|
|
12
17
|
import { getDb } from '../db/index.js';
|
|
13
18
|
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
14
19
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
15
20
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
21
|
+
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
16
22
|
import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
|
|
23
|
+
import { DEFAULTS } from '../constants.js';
|
|
24
|
+
import { PageRankService } from '../graph/pagerank.js';
|
|
25
|
+
import { HITSService } from '../graph/hits.js';
|
|
26
|
+
import { HeadingHealthService } from './heading.js';
|
|
27
|
+
import { annotateOrphans } from './orphan.js';
|
|
28
|
+
import { HealthService } from '../scoring/health.js';
|
|
17
29
|
/**
|
|
18
30
|
* Analyzes a site for SEO, content, and accessibility.
|
|
19
31
|
* Supports live crawling or loading from a database snapshot.
|
|
20
|
-
* Note: File-based data loading is not supported.
|
|
21
|
-
*
|
|
22
|
-
* @param url The root URL to analyze
|
|
23
|
-
* @param options Analysis options
|
|
24
|
-
* @param context Engine context for event emission
|
|
25
32
|
*/
|
|
26
33
|
export async function analyzeSite(url, options, context) {
|
|
27
|
-
|
|
28
|
-
|
|
34
|
+
// 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
|
|
35
|
+
// We resolve the *origin* — not the full page URL — so rootOrigin is always just the
|
|
36
|
+
// scheme+host and normalizedPath is always the pathname.
|
|
37
|
+
let parsedUrl = null;
|
|
38
|
+
try {
|
|
39
|
+
parsedUrl = new URL(url);
|
|
40
|
+
}
|
|
41
|
+
catch { /* bare domain fallback below */ }
|
|
42
|
+
const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
|
|
43
|
+
const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
|
|
44
|
+
let rootOrigin = inputOrigin;
|
|
45
|
+
if (options.live !== false) {
|
|
46
|
+
const resolver = new UrlResolver();
|
|
47
|
+
const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
|
|
48
|
+
try {
|
|
49
|
+
const resolved = await resolver.resolve(inputOrigin, fetcher);
|
|
50
|
+
rootOrigin = resolved.url;
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
// Fallback to basic normalization if resolution fails
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
// Normalize origin and target URL independently.
|
|
57
|
+
const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
|
|
58
|
+
if (!normalizedOrigin) {
|
|
29
59
|
throw new Error('Invalid URL for analysis');
|
|
30
60
|
}
|
|
61
|
+
const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
|
|
62
|
+
const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
|
|
63
|
+
|| UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
|
|
64
|
+
const start = Date.now();
|
|
31
65
|
let crawlData;
|
|
32
66
|
let robots = null;
|
|
33
|
-
//
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
67
|
+
// 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
|
|
68
|
+
if (options.live) {
|
|
69
|
+
try {
|
|
70
|
+
const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
|
|
71
|
+
const { Fetcher } = await import('../crawler/fetcher.js');
|
|
72
|
+
const fetcher = new Fetcher({
|
|
73
|
+
rate: DEFAULTS.RATE_LIMIT,
|
|
74
|
+
proxyUrl: options.proxyUrl,
|
|
75
|
+
userAgent: options.userAgent
|
|
76
|
+
});
|
|
77
|
+
const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
|
|
78
|
+
if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
|
|
79
|
+
const robotsParserModule = await import('robots-parser');
|
|
80
|
+
const robotsParser = robotsParserModule.default || robotsParserModule;
|
|
81
|
+
robots = robotsParser(robotsUrl, robotsRes.body);
|
|
82
|
+
if (context)
|
|
83
|
+
context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
catch {
|
|
87
|
+
// Fallback
|
|
43
88
|
}
|
|
44
89
|
}
|
|
45
|
-
|
|
46
|
-
// Silence robots fetch errors, fallback to existing or none
|
|
47
|
-
}
|
|
90
|
+
// Data Acquisition
|
|
48
91
|
if (options.live) {
|
|
49
|
-
|
|
92
|
+
const crawlStart = Date.now();
|
|
93
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
94
|
+
if (context)
|
|
95
|
+
context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
|
|
50
96
|
}
|
|
51
97
|
else {
|
|
52
98
|
try {
|
|
53
|
-
|
|
54
|
-
|
|
99
|
+
const loadStart = Date.now();
|
|
100
|
+
crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
|
|
101
|
+
if (context)
|
|
102
|
+
context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
|
|
55
103
|
const allPages = Array.from(crawlData.pages);
|
|
56
104
|
crawlData.pages = allPages;
|
|
57
|
-
|
|
58
|
-
const exists = allPages.some(p => p.url === normalizedRoot);
|
|
105
|
+
const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
|
|
59
106
|
if (!exists) {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
}
|
|
64
|
-
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
107
|
+
if (context)
|
|
108
|
+
context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
|
|
109
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
65
110
|
}
|
|
66
111
|
}
|
|
67
|
-
catch (
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
error.message.includes('not found in database');
|
|
72
|
-
if (isNotFound) {
|
|
73
|
-
options.live = true; // Force live mode
|
|
74
|
-
if (context) {
|
|
75
|
-
context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
|
|
76
|
-
}
|
|
77
|
-
crawlData = await runLiveCrawl(normalizedRoot, options, context);
|
|
78
|
-
}
|
|
79
|
-
else {
|
|
80
|
-
throw error;
|
|
81
|
-
}
|
|
112
|
+
catch (_error) {
|
|
113
|
+
if (context)
|
|
114
|
+
context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
|
|
115
|
+
crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
|
|
82
116
|
}
|
|
83
117
|
}
|
|
84
118
|
const snapshotId = crawlData.snapshotId;
|
|
85
119
|
const crawledAt = crawlData.crawledAt;
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
120
|
+
const pagesStart = Date.now();
|
|
121
|
+
const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
|
|
122
|
+
if (context)
|
|
123
|
+
context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
|
|
124
|
+
// Sync basic page analysis results back to graph nodes for persistence
|
|
125
|
+
for (const pageAnalysis of pages) {
|
|
126
|
+
const node = crawlData.graph.nodes.get(pageAnalysis.url);
|
|
127
|
+
if (node) {
|
|
128
|
+
node.soft404Score = pageAnalysis.soft404?.score;
|
|
129
|
+
node.wordCount = pageAnalysis.content.wordCount;
|
|
130
|
+
node.externalLinkRatio = pageAnalysis.links.externalRatio;
|
|
131
|
+
node.thinContentScore = pageAnalysis.thinScore;
|
|
132
|
+
node.title = pageAnalysis.title.value || undefined;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
89
135
|
const activeModules = {
|
|
90
136
|
seo: !!options.seo,
|
|
91
137
|
content: !!options.content,
|
|
92
138
|
accessibility: !!options.accessibility
|
|
93
139
|
};
|
|
94
140
|
const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
|
|
95
|
-
const filteredPages = hasFilters
|
|
96
|
-
|
|
97
|
-
: pages;
|
|
98
|
-
// Filter to only the requested URL
|
|
99
|
-
const targetPage = filteredPages.find(p => p.url === normalizedRoot);
|
|
141
|
+
const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
|
|
142
|
+
const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
|
|
100
143
|
let resultPages;
|
|
101
144
|
if (options.allPages) {
|
|
102
145
|
resultPages = filteredPages;
|
|
@@ -104,215 +147,247 @@ export async function analyzeSite(url, options, context) {
|
|
|
104
147
|
else {
|
|
105
148
|
resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
|
|
106
149
|
}
|
|
150
|
+
let clusters = [];
|
|
151
|
+
let duplicates = [];
|
|
152
|
+
let prResults = new Map();
|
|
153
|
+
let hitsResults = new Map();
|
|
154
|
+
let headingPayloads = {};
|
|
155
|
+
if (options.clustering) {
|
|
156
|
+
const clustering = new ClusteringService();
|
|
157
|
+
clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
|
|
158
|
+
}
|
|
159
|
+
if (options.allPages) {
|
|
160
|
+
const duplication = new DuplicateService();
|
|
161
|
+
duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
|
|
162
|
+
}
|
|
163
|
+
if (options.computePagerank) {
|
|
164
|
+
const prService = new PageRankService();
|
|
165
|
+
prResults = prService.evaluate(crawlData.graph);
|
|
166
|
+
}
|
|
167
|
+
if (options.computeHits) {
|
|
168
|
+
const hitsService = new HITSService();
|
|
169
|
+
hitsResults = hitsService.evaluate(crawlData.graph);
|
|
170
|
+
}
|
|
171
|
+
if (options.heading) {
|
|
172
|
+
const headingService = new HeadingHealthService();
|
|
173
|
+
const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
|
|
174
|
+
headingPayloads = payloadsByUrl;
|
|
175
|
+
}
|
|
176
|
+
if (options.orphans) {
|
|
177
|
+
const edges = crawlData.graph.getEdges();
|
|
178
|
+
annotateOrphans(crawlData.graph.getNodes(), edges, {
|
|
179
|
+
enabled: true,
|
|
180
|
+
severityEnabled: !!options.orphanSeverity,
|
|
181
|
+
includeSoftOrphans: !!options.includeSoftOrphans,
|
|
182
|
+
minInbound: options.minInbound || 2,
|
|
183
|
+
rootUrl: normalizedOrigin
|
|
184
|
+
});
|
|
185
|
+
}
|
|
186
|
+
// Run HealthService when --health is enabled
|
|
187
|
+
let healthBreakdown;
|
|
188
|
+
if (options.health) {
|
|
189
|
+
const healthService = new HealthService();
|
|
190
|
+
const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
|
|
191
|
+
healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
|
|
192
|
+
}
|
|
193
|
+
// Update nodes in graph with results
|
|
194
|
+
for (const node of crawlData.graph.getNodes()) {
|
|
195
|
+
const pr = prResults.get(node.url);
|
|
196
|
+
if (pr)
|
|
197
|
+
node.pagerankScore = pr.score;
|
|
198
|
+
const hits = hitsResults.get(node.url);
|
|
199
|
+
if (hits) {
|
|
200
|
+
node.hubScore = hits.hub_score;
|
|
201
|
+
node.authScore = hits.authority_score;
|
|
202
|
+
node.linkRole = hits.link_role;
|
|
203
|
+
}
|
|
204
|
+
const heading = headingPayloads[node.url];
|
|
205
|
+
if (heading) {
|
|
206
|
+
node.headingScore = heading.score;
|
|
207
|
+
node.headingData = JSON.stringify(heading);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
// Synchronize graph-level final scores back to PageAnalysis models
|
|
211
|
+
for (const page of pages) {
|
|
212
|
+
const node = crawlData.graph.nodes.get(page.url);
|
|
213
|
+
if (node) {
|
|
214
|
+
if (node.headingScore !== undefined)
|
|
215
|
+
page.headingScore = node.headingScore;
|
|
216
|
+
page.seoScore = scorePageSeo(page);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
107
219
|
const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
|
|
108
220
|
const thinPages = pages.filter((page) => page.thinScore >= 70).length;
|
|
109
221
|
const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
|
|
110
|
-
|
|
222
|
+
if (context)
|
|
223
|
+
context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
|
|
224
|
+
// Persist to Database
|
|
225
|
+
const db = getDb();
|
|
226
|
+
const metricsRepo = new MetricsRepository(db);
|
|
227
|
+
const pageRepo = new PageRepository(db);
|
|
228
|
+
// Efficiently map URLs to IDs for this snapshot
|
|
229
|
+
const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
230
|
+
const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
|
|
231
|
+
const metricsToSave = crawlData.graph.getNodes().map(node => {
|
|
232
|
+
const pageId = urlToIdMap.get(node.url);
|
|
233
|
+
if (!pageId)
|
|
234
|
+
return null;
|
|
235
|
+
return {
|
|
236
|
+
snapshot_id: snapshotId,
|
|
237
|
+
page_id: pageId,
|
|
238
|
+
crawl_status: node.crawlStatus || null,
|
|
239
|
+
word_count: node.wordCount || null,
|
|
240
|
+
thin_content_score: node.thinContentScore || null,
|
|
241
|
+
external_link_ratio: node.externalLinkRatio || null,
|
|
242
|
+
pagerank_score: node.pagerankScore || null,
|
|
243
|
+
hub_score: node.hubScore || null,
|
|
244
|
+
auth_score: node.authScore || null,
|
|
245
|
+
link_role: node.linkRole || null,
|
|
246
|
+
duplicate_cluster_id: node.duplicateClusterId || null,
|
|
247
|
+
duplicate_type: node.duplicateType || null,
|
|
248
|
+
cluster_id: node.clusterId || null,
|
|
249
|
+
soft404_score: node.soft404Score || null,
|
|
250
|
+
heading_score: node.headingScore || null,
|
|
251
|
+
orphan_score: node.orphanScore || null,
|
|
252
|
+
orphan_type: node.orphanType || null,
|
|
253
|
+
impact_level: node.impactLevel || null,
|
|
254
|
+
heading_data: node.headingData || null,
|
|
255
|
+
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
256
|
+
};
|
|
257
|
+
}).filter(m => m !== null);
|
|
258
|
+
// Persist health score to snapshot if computed
|
|
259
|
+
if (healthBreakdown && snapshotId) {
|
|
260
|
+
const db2 = getDb();
|
|
261
|
+
const snapshotRepo = new SnapshotRepository(db2);
|
|
262
|
+
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
263
|
+
health_score: healthBreakdown.score
|
|
264
|
+
});
|
|
265
|
+
}
|
|
266
|
+
metricsRepo.insertMany(metricsToSave);
|
|
267
|
+
const result = {
|
|
111
268
|
site_summary: {
|
|
112
269
|
pages_analyzed: resultPages.length,
|
|
113
270
|
avg_seo_score: siteScores.seoHealthScore,
|
|
114
271
|
thin_pages: thinPages,
|
|
115
272
|
duplicate_titles: duplicateTitles,
|
|
116
|
-
site_score: siteScores.overallScore
|
|
273
|
+
site_score: siteScores.overallScore,
|
|
274
|
+
site_score_breakdown: siteScores.breakdown
|
|
117
275
|
},
|
|
118
276
|
site_scores: siteScores,
|
|
119
277
|
pages: resultPages,
|
|
120
278
|
active_modules: activeModules,
|
|
121
|
-
clusters: crawlData.graph.contentClusters,
|
|
122
279
|
snapshotId,
|
|
123
|
-
crawledAt
|
|
280
|
+
crawledAt,
|
|
281
|
+
clusters,
|
|
282
|
+
duplicates
|
|
124
283
|
};
|
|
284
|
+
return result;
|
|
125
285
|
}
|
|
126
|
-
export function
|
|
127
|
-
if (result.pages.length === 1) {
|
|
128
|
-
return renderSinglePageHtml(result.pages[0]);
|
|
129
|
-
}
|
|
130
|
-
const rows = result.pages
|
|
131
|
-
.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
|
|
132
|
-
.join('');
|
|
133
|
-
return ANALYSIS_LIST_TEMPLATE
|
|
134
|
-
.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
|
|
135
|
-
.replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
|
|
136
|
-
.replace('{{ROWS}}', rows);
|
|
137
|
-
}
|
|
138
|
-
function renderSinglePageHtml(page) {
|
|
139
|
-
const structuredDataStatus = page.structuredData.present
|
|
140
|
-
? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
|
|
141
|
-
: 'Not detected';
|
|
142
|
-
const structuredDataTypesRow = page.structuredData.present ? `
|
|
143
|
-
<tr>
|
|
144
|
-
<th>Types Found</th>
|
|
145
|
-
<td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
|
|
146
|
-
</tr>
|
|
147
|
-
` : '';
|
|
148
|
-
return ANALYSIS_PAGE_TEMPLATE
|
|
149
|
-
.replaceAll('{{URL}}', escapeHtml(page.url))
|
|
150
|
-
.replace('{{SEO_SCORE}}', page.seoScore.toString())
|
|
151
|
-
.replace('{{THIN_SCORE}}', page.thinScore.toString())
|
|
152
|
-
.replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
|
|
153
|
-
.replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
|
|
154
|
-
.replace('{{TITLE_LENGTH}}', page.title.length.toString())
|
|
155
|
-
.replaceAll('{{TITLE_STATUS}}', page.title.status)
|
|
156
|
-
.replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
|
|
157
|
-
.replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
|
|
158
|
-
.replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
|
|
159
|
-
.replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
|
|
160
|
-
.replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
|
|
161
|
-
.replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
|
|
162
|
-
.replaceAll('{{H1_STATUS}}', page.h1.status)
|
|
163
|
-
.replace('{{H1_COUNT}}', page.h1.count.toString())
|
|
164
|
-
.replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
|
|
165
|
-
.replace('{{WORD_COUNT}}', page.content.wordCount.toString())
|
|
166
|
-
.replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
|
|
167
|
-
.replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
|
|
168
|
-
.replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
|
|
169
|
-
.replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
|
|
170
|
-
.replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
|
|
171
|
-
.replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
|
|
172
|
-
.replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
|
|
173
|
-
.replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
|
|
174
|
-
.replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
|
|
175
|
-
}
|
|
176
|
-
export function renderAnalysisMarkdown(result) {
|
|
177
|
-
const summary = [
|
|
178
|
-
'# Crawlith SEO Analysis Report',
|
|
179
|
-
'',
|
|
180
|
-
'## 📊 Summary',
|
|
181
|
-
`- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
|
|
182
|
-
`- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
|
|
183
|
-
`- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
|
|
184
|
-
`- Thin Pages Found: ${result.site_summary.thin_pages}`,
|
|
185
|
-
`- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
|
|
186
|
-
'',
|
|
187
|
-
'## 📄 Page Details',
|
|
188
|
-
'',
|
|
189
|
-
'| URL | SEO Score | Thin Score | Title Status | Meta Status |',
|
|
190
|
-
'| :--- | :--- | :--- | :--- | :--- |',
|
|
191
|
-
];
|
|
192
|
-
result.pages.forEach((page) => {
|
|
193
|
-
summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
|
|
194
|
-
});
|
|
195
|
-
return summary.join('\n');
|
|
196
|
-
}
|
|
197
|
-
export function renderAnalysisCsv(result) {
|
|
198
|
-
const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
|
|
199
|
-
const rows = result.pages.map((p) => {
|
|
200
|
-
const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
|
|
201
|
-
return [
|
|
202
|
-
p.url,
|
|
203
|
-
p.seoScore,
|
|
204
|
-
p.thinScore,
|
|
205
|
-
statusStr,
|
|
206
|
-
`"${(p.title.value || '').replace(/"/g, '""')}"`,
|
|
207
|
-
p.title.length,
|
|
208
|
-
`"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
|
|
209
|
-
p.metaDescription.length,
|
|
210
|
-
p.content.wordCount,
|
|
211
|
-
p.links.internalLinks,
|
|
212
|
-
p.links.externalLinks
|
|
213
|
-
].join(',');
|
|
214
|
-
});
|
|
215
|
-
return [headers.join(','), ...rows].join('\n');
|
|
216
|
-
}
|
|
217
|
-
/**
 * Escapes the characters that are significant in HTML text and in quoted
 * attribute values.
 *
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} value - Raw text; `null`/`undefined` yield an empty string.
 * @returns {string} Text safe to interpolate into HTML.
 */
function escapeHtml(value) {
    return String(value ?? '')
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .replaceAll('"', '&quot;')
        .replaceAll("'", '&#39;');
}
|
|
220
|
-
/**
 * Runs the per-page analyzers over crawled pages and then, in a second pass,
 * marks duplicate titles / meta descriptions and computes the thin-content
 * and SEO scores.
 *
 * @param {string} targetUrl - URL of the page the analysis was requested for.
 * @param {string} rootOrigin - Origin passed to `UrlUtil` to resolve stored page URLs.
 * @param {Iterable<object>} pages - Crawled pages; reads url, html, status,
 *   crawlStatus, canonical, noindex and nofollow from each.
 * @param {object} [robots] - Optional robots checker exposing `isAllowed(url, agent)`.
 * @param {object} [options] - `options.allPages` analyzes every page; otherwise
 *   only the page matching `targetUrl` is analyzed.
 * @returns {object[]} One analysis record per analyzed page. The raw HTML is
 *   attached as a non-enumerable `html` property.
 */
export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
    const titleCounts = new Map();
    const metaCounts = new Map();
    const sentenceCountFrequency = new Map();
    const results = [];
    // Adds one to the tally stored under `key`.
    const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);
    // Key used to compare titles / descriptions for duplicates.
    const duplicateKey = (text) => text.trim().toLowerCase();
    const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
    const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);

    for (const page of pages) {
        const pagePath = UrlUtil.toPath(page.url, rootOrigin);
        // Absolute URL rebuilt from the stored URL; used for target matching,
        // robots checks and link resolution (previously computed twice per page).
        const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
        const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbsUrl === targetAbs;
        // In single-page mode, if it's not the target, we skip it entirely for speed.
        if (!options.allPages && !isTarget) {
            continue;
        }
        const html = page.html || '';
        const $ = load(html || '<html></html>');

        let crawlStatus = page.crawlStatus;
        if (robots) {
            // A URL without a trailing slash is also checked with one appended.
            const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
                (!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
            if (isBlocked) {
                crawlStatus = 'blocked_by_robots';
            }
        }

        // Shared DOM Analysis
        const title = analyzeTitle($);
        const metaDescription = analyzeMetaDescription($);
        const h1 = analyzeH1($, title.value);
        const content = analyzeContent($);
        const images = analyzeImageAlts($);
        const links = analyzeLinks($, pageAbsUrl, rootOrigin);
        const structuredData = analyzeStructuredData($);

        if (title.value) {
            bump(titleCounts, duplicateKey(title.value));
        }
        if (metaDescription.value) {
            bump(metaCounts, duplicateKey(metaDescription.value));
        }
        bump(sentenceCountFrequency, content.uniqueSentenceCount);

        // NOTE(review): a new service is created per page, as before; hoisting it
        // out of the loop is only safe if Soft404Service keeps no state — confirm.
        const soft404Service = new Soft404Service();
        const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);

        // The canonical conflicts when it differs from both the stored and the
        // absolute page URL, ignoring a single trailing slash.
        const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
            page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));

        const resultPage = {
            url: page.url,
            status: page.status || 0,
            title,
            metaDescription,
            h1,
            content,
            thinScore: 0, // filled in by the second pass
            images,
            links,
            structuredData,
            seoScore: 0, // filled in by the second pass
            meta: {
                canonical: page.canonical,
                noindex: page.noindex,
                nofollow: page.nofollow,
                crawlStatus,
                canonicalConflict: isCanonicalConflict
            },
            soft404
        };
        // Non-enumerable so the HTML is not copied by spreads or serialized.
        Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
        results.push(resultPage);
    }

    // Second pass: duplicates are only known once every page has been counted.
    for (const analysis of results) {
        if (analysis.title.value && (titleCounts.get(duplicateKey(analysis.title.value)) || 0) > 1) {
            analysis.title.status = 'duplicate';
        }
        if (analysis.metaDescription.value && (metaCounts.get(duplicateKey(analysis.metaDescription.value)) || 0) > 1) {
            analysis.metaDescription.status = 'duplicate';
        }
        // Pages sharing the same unique-sentence count get a duplication score of 100.
        const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
        analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
        analysis.seoScore = scorePageSeo(analysis);
    }
    return results;
}
|
|
299
373
|
/**
 * Returns a copy of an analyzed page in which the sections belonging to
 * disabled modules are replaced by neutral placeholder values.
 *
 * @param {object} page - Analyzed page record.
 * @param {object} modules - Flags `seo`, `content` and `accessibility`.
 * @returns {object} Shallow copy of `page` with disabled sections blanked.
 *   A string `html` property is carried over as a non-enumerable property.
 */
function filterPageModules(page, modules) {
    const seoEnabled = modules.seo;
    const contentEnabled = modules.content;
    const filtered = {
        ...page,
        title: seoEnabled ? page.title : { value: null, length: 0, status: 'missing' },
        metaDescription: seoEnabled ? page.metaDescription : { value: null, length: 0, status: 'missing' },
        h1: (seoEnabled || contentEnabled) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
        links: seoEnabled ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
        structuredData: seoEnabled ? page.structuredData : { present: false, valid: false, types: [] },
        content: contentEnabled ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
        thinScore: contentEnabled ? page.thinScore : 0,
        images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
    };
    // The spread above skips a non-enumerable `html`, so re-attach it. Any string
    // qualifies: a truthiness check would drop a page whose HTML is empty.
    if (typeof page.html === 'string') {
        Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
    }
    return filtered;
}
|
|
315
|
-
async function loadCrawlData(rootUrl) {
|
|
390
|
+
async function loadCrawlData(rootUrl, snapshotId) {
|
|
316
391
|
const db = getDb();
|
|
317
392
|
const siteRepo = new SiteRepository(db);
|
|
318
393
|
const snapshotRepo = new SnapshotRepository(db);
|
|
@@ -320,22 +395,26 @@ async function loadCrawlData(rootUrl) {
|
|
|
320
395
|
const urlObj = new URL(rootUrl);
|
|
321
396
|
const domain = urlObj.hostname.replace('www.', '');
|
|
322
397
|
const site = siteRepo.firstOrCreateSite(domain);
|
|
323
|
-
let snapshot;
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
|
|
327
|
-
}
|
|
328
|
-
if (!snapshot) {
|
|
329
|
-
snapshot = snapshotRepo.getLatestSnapshot(site.id);
|
|
398
|
+
let snapshot = null;
|
|
399
|
+
if (snapshotId) {
|
|
400
|
+
snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
330
401
|
}
|
|
331
402
|
if (!snapshot) {
|
|
332
|
-
|
|
403
|
+
for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
|
|
404
|
+
const page = pageRepo.getPage(site.id, candidate);
|
|
405
|
+
if (page?.last_seen_snapshot_id) {
|
|
406
|
+
snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
|
|
407
|
+
break;
|
|
408
|
+
}
|
|
409
|
+
}
|
|
333
410
|
}
|
|
411
|
+
if (!snapshot)
|
|
412
|
+
snapshot = snapshotRepo.getLatestSnapshot(site.id);
|
|
413
|
+
if (!snapshot)
|
|
414
|
+
throw new Error(`No crawl data found for ${rootUrl}`);
|
|
334
415
|
const graph = loadGraphFromSnapshot(snapshot.id);
|
|
335
416
|
const metrics = calculateMetrics(graph, 5);
|
|
336
|
-
// Use iterator to save memory
|
|
337
417
|
const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
|
|
338
|
-
// We need to map the DB pages to CrawlPage format lazily
|
|
339
418
|
const pagesGenerator = function* () {
|
|
340
419
|
for (const p of dbPagesIterator) {
|
|
341
420
|
yield {
|
|
@@ -352,29 +431,54 @@ async function loadCrawlData(rootUrl) {
|
|
|
352
431
|
};
|
|
353
432
|
return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
|
|
354
433
|
}
|
|
355
|
-
/**
 * Crawls a single URL (limit 1, depth 0) and returns the resulting pages,
 * graph and metrics for that snapshot.
 *
 * @param {string} url - URL to crawl.
 * @param {string} origin - Not read by this function.
 * @param {object} options - Reads rate, proxyUrl, userAgent, maxRedirects,
 *   debug, sitemap and plugins, which are forwarded to `crawl`.
 * @param {object} context - Passed through to `crawl` unchanged.
 * @param {object} robots - Forwarded to `crawl` as the `robots` option.
 * @returns {Promise<object>} `{ pages, metrics, graph, snapshotId }`.
 */
async function runLiveCrawl(url, origin, options, context, robots) {
    const crawlOptions = {
        limit: 1,
        depth: 0,
        rate: options.rate,
        proxyUrl: options.proxyUrl,
        userAgent: options.userAgent,
        maxRedirects: options.maxRedirects,
        debug: options.debug,
        snapshotRunType: 'single',
        robots,
        sitemap: options.sitemap,
        plugins: options.plugins
    };
    const snapshotId = await crawl(url, crawlOptions, context);
    const graph = loadGraphFromSnapshot(snapshotId);
    const pages = [];
    for (const node of graph.getNodes()) {
        pages.push({
            url: node.url,
            status: node.status,
            html: node.html || '',
            depth: node.depth,
            crawlStatus: node.crawlStatus
        });
    }
    const metrics = calculateMetrics(graph, 1);
    return { pages, metrics, graph, snapshotId };
}
|
|
458
|
+
/**
 * Escapes the characters that are significant in HTML text and in quoted
 * attribute values.
 *
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} value - Raw text; `null`/`undefined` yield an empty string.
 * @returns {string} Text safe to interpolate into HTML.
 */
export function escapeHtml(value) {
    return String(value ?? '')
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .replaceAll('"', '&quot;')
        .replaceAll("'", '&#39;');
}
|
|
461
|
+
/**
 * Renders an analysis result as HTML. A result with exactly one page uses the
 * single-page report; otherwise the list template is filled with one table
 * row per page.
 *
 * @param {object} result - Analysis result; reads `result.pages` and
 *   `result.site_summary` (pages_analyzed, avg_seo_score).
 * @returns {string} The rendered HTML document.
 */
export function renderAnalysisHtml(result) {
    if (result.pages.length === 1) {
        return renderSinglePageHtml(result.pages[0]);
    }
    const rows = result.pages
        .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
        .join('');
    const pagesAnalyzed = result.site_summary.pages_analyzed.toString();
    const avgSeoScore = result.site_summary.avg_seo_score.toString();
    // Replacer functions insert their return value verbatim. A plain string
    // replacement would expand `$&`, `$'` and similar sequences, which can
    // occur in crawled URLs.
    return ANALYSIS_LIST_TEMPLATE
        .replace('{{PAGES_ANALYZED}}', () => pagesAnalyzed)
        .replace('{{AVG_SEO_SCORE}}', () => avgSeoScore)
        .replace('{{ROWS}}', () => rows);
}
|
|
467
|
+
/**
 * Fills the single-page report template with the values of one analyzed page.
 *
 * Substitution happens in one pass over the template. This means:
 *  - inserted values are used verbatim (no `$&`-style expansion),
 *  - placeholder-looking text inside a crawled value is not expanded again,
 *  - every occurrence of a known placeholder is replaced,
 *  - unknown placeholders are left untouched.
 *
 * @param {object} page - Analyzed page record (url, scores, status, title,
 *   metaDescription, meta, h1, content, links, images, structuredData).
 * @returns {string} The rendered HTML document.
 */
function renderSinglePageHtml(page) {
    const { structuredData } = page;
    let structuredDataStatus = 'Not detected';
    if (structuredData.present) {
        structuredDataStatus = structuredData.valid
            ? '<span class="status-ok">Valid</span>'
            : '<span class="status-critical">Invalid JSON</span>';
    }
    // Type names originate from the crawled page's structured data, so escape them.
    const structuredDataTypesRow = structuredData.present
        ? `<tr><th>Types Found</th><td>${structuredData.types.map((type) => `<code>${escapeHtml(String(type))}</code>`).join(', ')}</td></tr>`
        : '';
    const values = new Map([
        ['URL', escapeHtml(page.url)],
        ['SEO_SCORE', page.seoScore.toString()],
        ['THIN_SCORE', page.thinScore.toString()],
        ['HTTP_STATUS', page.status === 0 ? 'Pending/Limit' : page.status.toString()],
        ['TITLE_VALUE', escapeHtml(page.title.value || '(missing)')],
        ['TITLE_LENGTH', page.title.length.toString()],
        ['TITLE_STATUS', page.title.status],
        ['META_DESCRIPTION_VALUE', escapeHtml(page.metaDescription.value || '(missing)')],
        ['META_DESCRIPTION_LENGTH', page.metaDescription.length.toString()],
        ['META_DESCRIPTION_STATUS', page.metaDescription.status],
        ['CANONICAL', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'],
        ['ROBOTS_INDEX', (!page.meta.noindex).toString()],
        ['ROBOTS_FOLLOW', (!page.meta.nofollow).toString()],
        ['H1_STATUS', page.h1.status],
        ['H1_COUNT', page.h1.count.toString()],
        ['H1_MATCHES_TITLE', page.h1.matchesTitle ? ' | Matches Title' : ''],
        ['WORD_COUNT', page.content.wordCount.toString()],
        ['UNIQUE_SENTENCES', page.content.uniqueSentenceCount.toString()],
        ['TEXT_HTML_RATIO', (page.content.textHtmlRatio * 100).toFixed(2)],
        ['INTERNAL_LINKS', page.links.internalLinks.toString()],
        ['EXTERNAL_LINKS', page.links.externalLinks.toString()],
        ['EXTERNAL_RATIO', (page.links.externalRatio * 100).toFixed(1)],
        ['TOTAL_IMAGES', page.images.totalImages.toString()],
        ['MISSING_ALT', page.images.missingAlt.toString()],
        ['STRUCTURED_DATA_STATUS', structuredDataStatus],
        ['STRUCTURED_DATA_TYPES_ROW', structuredDataTypesRow],
    ]);
    return ANALYSIS_PAGE_TEMPLATE.replace(/\{\{([A-Z0-9_]+)\}\}/g, (placeholder, name) => (values.has(name) ? values.get(name) : placeholder));
}
|
|
472
|
+
/**
 * Renders an analysis result as a Markdown report: a summary list followed
 * by one table row per analyzed page (including its canonical URL, or `-`
 * when none is set).
 *
 * @param {object} result - Analysis result; reads `result.site_summary`
 *   (pages_analyzed, site_score, avg_seo_score, thin_pages, duplicate_titles)
 *   and `result.pages` (url, seoScore, thinScore, title.status,
 *   metaDescription.status, meta.canonical).
 * @returns {string} The report, lines joined with `\n`.
 */
export function renderAnalysisMarkdown(result) {
    // A raw pipe or line break inside a cell would split or terminate the
    // table row, so neutralise both before interpolating crawled text.
    const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');
    const stats = result.site_summary;
    const summary = [
        '# Crawlith SEO Analysis Report',
        '',
        '## 📊 Summary',
        `- Pages Analyzed: ${stats.pages_analyzed}`,
        `- Overall Site Score: ${stats.site_score.toFixed(1)}`,
        `- Avg SEO Score: ${stats.avg_seo_score.toFixed(1)}`,
        `- Thin Pages Found: ${stats.thin_pages}`,
        `- Duplicate Titles: ${stats.duplicate_titles}`,
        '',
        '## 📄 Page Details',
        '',
        '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |',
        '| :--- | :--- | :--- | :--- | :--- | :--- |',
    ];
    for (const page of result.pages) {
        summary.push(`| ${cell(page.url)} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${cell(page.meta.canonical || '-')} |`);
    }
    return summary.join('\n');
}
|
|
477
|
+
/**
 * Renders the per-page analysis as CSV (header row plus one row per page).
 *
 * Title and meta description are always wrapped in double quotes, as before.
 * The URL and canonical are now quoted whenever they contain a comma, double
 * quote or line break, so such a value can no longer shift the remaining
 * columns.
 *
 * @param {object} result - Analysis result; reads `result.pages`.
 * @returns {string} CSV text with rows joined by `\n`.
 */
export function renderAnalysisCsv(result) {
    // Always-quoted field with embedded quotes doubled.
    const quoted = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;
    // Quote only when the value contains a CSV metacharacter.
    const field = (value) => {
        const text = String(value ?? '');
        return /[",\r\n]/.test(text) ? quoted(text) : text;
    };
    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
    const rows = result.pages.map((p) => {
        // A status of 0 is reported as 'Pending/Limit' instead of the number.
        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
        return [
            field(p.url),
            p.seoScore,
            p.thinScore,
            statusStr,
            quoted(p.title.value),
            p.title.length,
            quoted(p.metaDescription.value),
            p.metaDescription.length,
            p.content.wordCount,
            p.links.internalLinks,
            p.links.externalLinks,
            field(p.meta.canonical || ''),
        ].join(',');
    });
    return [headers.join(','), ...rows].join('\n');
}
|