@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/scoring/hits.js
DELETED
|
@@ -1,111 +0,0 @@
|
|
|
1
|
-
/**
 * Runs the HITS (hub / authority) algorithm over the link graph.
 *
 * Scores are written in place onto the eligible nodes as `authorityScore`
 * and `hubScore`. After the last round the eligible nodes are handed to
 * `classifyLinkRoles`. Nothing is returned.
 *
 * @param graph Object exposing `getNodes()` and `getEdges()`.
 * @param options Optional settings. `iterations` is the number of update
 *   rounds; any falsy value (including 0) falls back to 20.
 */
export function computeHITS(graph, options = {}) {
    const rounds = options.iterations || 20;

    // Eligible pages: HTTP 200, no redirect chain, not flagged noindex.
    const candidates = graph.getNodes().filter((page) => {
        if (page.status !== 200) {
            return false;
        }
        if (page.redirectChain && page.redirectChain.length !== 0) {
            return false;
        }
        return !page.noindex;
    });
    if (candidates.length === 0) {
        return;
    }

    // Index eligible pages by URL and seed both scores with 1.0.
    const byUrl = new Map();
    candidates.forEach((page) => {
        byUrl.set(page.url, page);
        page.authorityScore = 1.0;
        page.hubScore = 1.0;
    });

    // Adjacency lists limited to edges whose two endpoints are eligible;
    // self-links are ignored.
    const inbound = new Map();
    const outbound = new Map();
    for (const link of graph.getEdges()) {
        if (link.source === link.target || !byUrl.has(link.source) || !byUrl.has(link.target)) {
            continue;
        }
        if (!inbound.has(link.target)) {
            inbound.set(link.target, []);
        }
        inbound.get(link.target).push({ source: link.source, weight: link.weight });
        if (!outbound.has(link.source)) {
            outbound.set(link.source, []);
        }
        outbound.get(link.source).push({ target: link.target, weight: link.weight });
    }

    // Divides one score field of every eligible page by the given L2 norm.
    // A zero norm leaves the scores untouched.
    const rescale = (field, norm) => {
        if (norm > 0) {
            for (const page of candidates) {
                page[field] = (page[field] || 0) / norm;
            }
        }
    };

    for (let round = 0; round < rounds; round += 1) {
        // Authority = weighted sum of the hub scores of pages linking in.
        let authSquares = 0;
        for (const page of candidates) {
            let authority = 0;
            for (const link of inbound.get(page.url) || []) {
                authority += (byUrl.get(link.source).hubScore || 0) * link.weight;
            }
            page.authorityScore = authority;
            authSquares += authority * authority;
        }
        rescale('authorityScore', Math.sqrt(authSquares));

        // Hub = weighted sum of the (already normalised) authority scores
        // of the pages linked to.
        let hubSquares = 0;
        for (const page of candidates) {
            let hub = 0;
            for (const link of outbound.get(page.url) || []) {
                hub += (byUrl.get(link.target).authorityScore || 0) * link.weight;
            }
            page.hubScore = hub;
            hubSquares += hub * hub;
        }
        rescale('hubScore', Math.sqrt(hubSquares));
    }

    classifyLinkRoles(candidates);
}
|
|
82
|
-
function classifyLinkRoles(nodes) {
|
|
83
|
-
if (nodes.length === 0)
|
|
84
|
-
return;
|
|
85
|
-
const authScores = nodes.map(n => n.authorityScore || 0).sort((a, b) => a - b);
|
|
86
|
-
const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
|
|
87
|
-
// Use 75th percentile as "high" threshold
|
|
88
|
-
const medianAuth = authScores[Math.floor(authScores.length / 2)];
|
|
89
|
-
const medianHub = hubScores[Math.floor(hubScores.length / 2)];
|
|
90
|
-
for (const node of nodes) {
|
|
91
|
-
const auth = node.authorityScore || 0;
|
|
92
|
-
const hub = node.hubScore || 0;
|
|
93
|
-
const isHighAuth = auth > medianAuth && auth > 0.0001;
|
|
94
|
-
const isHighHub = hub > medianHub && hub > 0.0001;
|
|
95
|
-
if (isHighAuth && isHighHub) {
|
|
96
|
-
node.linkRole = 'power';
|
|
97
|
-
}
|
|
98
|
-
else if (isHighAuth) {
|
|
99
|
-
node.linkRole = 'authority';
|
|
100
|
-
}
|
|
101
|
-
else if (isHighHub) {
|
|
102
|
-
node.linkRole = 'hub';
|
|
103
|
-
}
|
|
104
|
-
else if (auth > 0.0001 && hub > 0.0001) {
|
|
105
|
-
node.linkRole = 'balanced';
|
|
106
|
-
}
|
|
107
|
-
else {
|
|
108
|
-
node.linkRole = 'peripheral';
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
}
|
package/src/analysis/analyze.ts
DELETED
|
@@ -1,548 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import { crawl } from '../crawler/crawl.js';
|
|
3
|
-
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
4
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
5
|
-
import { calculateMetrics, Metrics } from '../graph/metrics.js';
|
|
6
|
-
import { Graph, ClusterInfo } from '../graph/graph.js';
|
|
7
|
-
import { analyzeContent, calculateThinContentScore } from './content.js';
|
|
8
|
-
import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses, H1Analysis, TextFieldAnalysis } from './seo.js';
|
|
9
|
-
import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
|
|
10
|
-
import { analyzeLinks, LinkRatioAnalysis } from './links.js';
|
|
11
|
-
import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
|
|
12
|
-
import { aggregateSiteScore, scorePageSeo } from './scoring.js';
|
|
13
|
-
import { detectContentClusters } from '../graph/cluster.js';
|
|
14
|
-
import { getDb } from '../db/index.js';
|
|
15
|
-
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
16
|
-
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
17
|
-
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
18
|
-
|
|
19
|
-
/** One crawled page as consumed by the page analysers in this module. */
export interface CrawlPage {
  /** Page URL; used as the page's identity in the analysis result. */
  url: string;
  /** HTTP status; a missing value is reported as 0 by the analysis. */
  status?: number;
  /** Raw markup; a missing value is analysed as an empty string. */
  html?: string;
  /** NOTE(review): presumably the crawl depth from the root; not read in this module. */
  depth?: number;
  /** Canonical URL, copied into `PageAnalysis.meta`. */
  canonical?: string;
  /** Robots noindex flag, copied into `PageAnalysis.meta`. */
  noindex?: boolean;
  /** Robots nofollow flag, copied into `PageAnalysis.meta`. */
  nofollow?: boolean;
}
|
|
28
|
-
|
|
29
|
-
/** Options accepted by `analyzeSite`. */
export interface AnalyzeOptions {
  /**
   * Crawl source handed to the crawl-data loader. When set, a "not found"
   * failure is rethrown instead of falling back to a live crawl.
   */
  fromCrawl?: string;
  /** Skip stored crawl data and run a live crawl straight away. */
  live?: boolean;
  /** NOTE(review): not read by the visible analysis code; presumably selects HTML output. */
  html?: boolean;

  // Module filters: when at least one is set, sections belonging to the
  // modules that are NOT set are blanked out in the returned pages.
  seo?: boolean;
  content?: boolean;
  accessibility?: boolean;

  // NOTE(review): the five options below are not read in the visible code;
  // presumably they are forwarded to the live crawl. Confirm before relying on it.
  rate?: number;
  proxyUrl?: string;
  userAgent?: string;
  maxRedirects?: number;
  debug?: boolean;

  /** Passed as the threshold argument to content-cluster detection. */
  clusterThreshold?: number;
  /** Passed as the minimum cluster size to content-cluster detection. */
  minClusterSize?: number;
}
|
|
44
|
-
|
|
45
|
-
/** Per-page analysis record produced by the page analysers. */
export interface PageAnalysis {
  url: string;
  /** HTTP status; 0 when the crawl recorded none. */
  status: number;
  /** Title analysis, including duplicate detection across the analysed pages. */
  title: TextFieldAnalysis;
  /** Meta-description analysis, including duplicate detection. */
  metaDescription: TextFieldAnalysis;
  h1: H1Analysis;
  content: ReturnType<typeof analyzeContent>;
  /** Thin-content score; pages scoring 70 or more are counted as thin in the site summary. */
  thinScore: number;
  images: ImageAltAnalysis;
  links: LinkRatioAnalysis;
  structuredData: StructuredDataResult;
  /** Page SEO score computed from the other fields of this record. */
  seoScore: number;
  /** Robots / canonical data copied from the crawled page. */
  meta: {
    canonical?: string;
    noindex?: boolean;
    nofollow?: boolean;
  }
}
|
|
63
|
-
|
|
64
|
-
/** Result returned by `analyzeSite`. */
export interface AnalysisResult {
  /** Headline numbers, computed over every analysed page (not only those in `pages`). */
  site_summary: {
    /** Number of pages that were analysed. */
    pages_analyzed: number;
    /** Mirrors `site_scores.seoHealthScore`. */
    avg_seo_score: number;
    /** Pages whose thin-content score is 70 or higher. */
    thin_pages: number;
    /** Pages whose title status is `duplicate`. */
    duplicate_titles: number;
    /** Mirrors `site_scores.overallScore`. */
    site_score: number;
  };
  /** Full aggregate score object. */
  site_scores: ReturnType<typeof aggregateSiteScore>;
  /** Pages included in the report, after module filtering and target-URL selection. */
  pages: PageAnalysis[];
  /** Which module filters were requested. */
  active_modules: {
    seo: boolean;
    content: boolean;
    accessibility: boolean;
  };
  /** Content clusters read from the crawl graph after cluster detection. */
  clusters?: ClusterInfo[];
}
|
|
81
|
-
|
|
82
|
-
/** Crawl output that `analyzeSite` works on, from a live crawl or stored data. */
interface CrawlData {
  /** Crawled pages to analyse. */
  pages: CrawlPage[];
  /** Graph metrics fed into the site-score aggregation. */
  metrics: Metrics;
  /** Link graph; content clusters are detected on it and read back from it. */
  graph: Graph;
}
|
|
87
|
-
|
|
88
|
-
/**
 * Analyses a site, or the single page at `url`, and returns the scored result.
 *
 * Crawl data comes from a live crawl when `options.live` is set. Otherwise
 * stored crawl data is loaded; if that fails with a "not found" style error
 * and no explicit `options.fromCrawl` was given, a live crawl is run instead.
 *
 * Summary figures are computed over every analysed page. The returned
 * `pages` list is narrowed to the page matching the normalised URL when one
 * exists; failing that, to the first page in live mode, or to all pages.
 *
 * @param url URL to analyse.
 * @param options Analysis options.
 * @returns The analysis result.
 * @throws Error when `url` cannot be normalised, or when loading crawl data
 *   fails for a reason other than missing data (the original error is rethrown).
 */
export async function analyzeSite(url: string, options: AnalyzeOptions): Promise<AnalysisResult> {
  const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
  if (!normalizedRoot) {
    throw new Error('Invalid URL for analysis');
  }

  let crawlData: CrawlData;

  if (options.live) {
    crawlData = await runLiveCrawl(normalizedRoot, options);
  } else {
    try {
      crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
    } catch (error: unknown) {
      // Only missing data, without an explicitly requested source, may fall
      // back to a live crawl. Everything else surfaces unchanged.
      if (!isMissingCrawlDataError(error) || options.fromCrawl) {
        throw error;
      }
      console.log('No local crawl data found. Switching to live analysis mode...');
      crawlData = await runLiveCrawl(normalizedRoot, options);
    }
  }

  // Clustering always runs; the result is read back from the graph below.
  detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);

  const pages = analyzePages(normalizedRoot, crawlData.pages);

  const activeModules = {
    seo: !!options.seo,
    content: !!options.content,
    accessibility: !!options.accessibility
  };

  const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;

  const filteredPages = hasFilters
    ? pages.map((page) => filterPageModules(page, activeModules))
    : pages;

  // Report only the requested URL when it was crawled.
  const targetPage = filteredPages.find((p) => p.url === normalizedRoot);
  const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);

  // Site-level figures deliberately use the unfiltered page list.
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
  const siteScores = aggregateSiteScore(crawlData.metrics, pages);

  return {
    site_summary: {
      pages_analyzed: pages.length,
      avg_seo_score: siteScores.seoHealthScore,
      thin_pages: thinPages,
      duplicate_titles: duplicateTitles,
      site_score: siteScores.overallScore
    },
    site_scores: siteScores,
    pages: resultPages,
    active_modules: activeModules,
    clusters: crawlData.graph.contentClusters
  };
}

/**
 * Reports whether a caught value looks like a "crawl data is missing" failure:
 * an `ENOENT` code, or a message containing one of the known marker phrases.
 * Values without a string `message` are never treated as missing-data errors.
 */
function isMissingCrawlDataError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) {
    return false;
  }
  const { code, message } = error as { code?: unknown; message?: unknown };
  if (code === 'ENOENT') {
    return true;
  }
  if (typeof message !== 'string') {
    return false;
  }
  const markers = ['Crawl data not found', 'No completed snapshot found', 'not found in database'];
  return markers.some((marker) => message.includes(marker));
}
|
|
154
|
-
|
|
155
|
-
/**
 * Renders an analysis result as a standalone HTML document.
 *
 * A result holding exactly one page is rendered as the detailed single-page
 * report. Any other result becomes a summary table with one row per page.
 *
 * @param result Analysis result to render.
 * @returns The HTML document as a string.
 */
export function renderAnalysisHtml(result: AnalysisResult): string {
  if (result.pages.length === 1) {
    return renderSinglePageHtml(result.pages[0]);
  }

  // Tags must not contain whitespace after "<": "< tr >" is rendered as
  // literal text by browsers rather than as a table row.
  const rows = result.pages
    .map((page) =>
      `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td>` +
      `<td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`
    )
    .join('');

  return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
}
|
|
165
|
-
|
|
166
|
-
/**
 * Renders the detailed HTML report for one analysed page.
 *
 * Every value taken from the crawled page (URL, title, description,
 * canonical, structured-data type names) is escaped before it is
 * interpolated, because it originates from a remote site.
 *
 * @param page Page analysis to render.
 * @returns A complete HTML document.
 */
function renderSinglePageHtml(page: PageAnalysis): string {
  const urlText = escapeHtml(page.url);
  // Inside an attribute, quotes must be encoded as well so the value cannot
  // terminate the href early.
  const urlAttr = urlText.replace(/"/g, '&quot;').replace(/'/g, '&#39;');

  const structuredDataStatus = page.structuredData.present
    ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
    : 'Not detected';

  const structuredDataTypes = page.structuredData.present ? `
    <tr>
      <th>Types Found</th>
      <td>${page.structuredData.types.map((t) => `<code>${escapeHtml(String(t))}</code>`).join(', ')}</td>
    </tr>
  ` : '';

  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Analysis for ${urlText}</title>
  <style>
    body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
    h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
    h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
    .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
    .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
    .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
    .status-ok { color: green; font-weight: bold; }
    .status-warning { color: orange; font-weight: bold; }
    .status-critical { color: red; font-weight: bold; }
    .status-missing { color: red; font-weight: bold; }
    .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
    .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
    .data-table th { width: 150px; color: #666; }
    code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
  </style>
</head>
<body>
  <h1>Page Analysis</h1>
  <p><strong>URL:</strong> <a href="${urlAttr}" target="_blank" rel="noopener noreferrer">${urlText}</a></p>

  <div class="score-card">
    <div class="score-box">
      <div class="score-val">${page.seoScore}</div>
      <div>SEO Score</div>
    </div>
    <div class="score-box">
      <div class="score-val">${page.thinScore}</div>
      <div>Thin Content Score</div>
    </div>
    <div class="score-box">
      <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
      <div>HTTP Status</div>
    </div>
  </div>

  <h2>Meta Tags</h2>
  <table class="data-table">
    <tr>
      <th>Title</th>
      <td>
        <div>${escapeHtml(page.title.value || '(missing)')}</div>
        <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
      </td>
    </tr>
    <tr>
      <th>Description</th>
      <td>
        <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
        <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
      </td>
    </tr>
    <tr>
      <th>Canonical</th>
      <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
    </tr>
    <tr>
      <th>Robots</th>
      <td>
        Index: ${!page.meta.noindex},
        Follow: ${!page.meta.nofollow}
      </td>
    </tr>
  </table>

  <h2>Content &amp; Heading</h2>
  <table class="data-table">
    <tr>
      <th>H1 Tag</th>
      <td>
        Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
        (${page.h1.count} detected)
        ${page.h1.matchesTitle ? ' | Matches Title' : ''}
      </td>
    </tr>
    <tr>
      <th>Word Count</th>
      <td>${page.content.wordCount} words</td>
    </tr>
    <tr>
      <th>Unique Sentences</th>
      <td>${page.content.uniqueSentenceCount}</td>
    </tr>
    <tr>
      <th>Text / HTML Ratio</th>
      <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
    </tr>
  </table>

  <h2>Links &amp; Images</h2>
  <table class="data-table">
    <tr>
      <th>Internal Links</th>
      <td>${page.links.internalLinks}</td>
    </tr>
    <tr>
      <th>External Links</th>
      <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
    </tr>
    <tr>
      <th>Images</th>
      <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
    </tr>
  </table>

  <h2>Structured Data</h2>
  <table class="data-table">
    <tr>
      <th>Status</th>
      <td>
        ${structuredDataStatus}
      </td>
    </tr>
    ${structuredDataTypes}
  </table>
</body>
</html>`;
}
|
|
299
|
-
|
|
300
|
-
/**
 * Renders an analysis result as a Markdown report: a summary list followed
 * by a table with one row per page.
 *
 * Pipe characters inside cell values are backslash-escaped so that a URL
 * containing `|` cannot split a table row into extra columns.
 *
 * @param result Analysis result to render.
 * @returns Markdown text with lines joined by `\n`.
 */
export function renderAnalysisMarkdown(result: AnalysisResult): string {
  const cell = (value: string | number): string => String(value).replace(/\|/g, '\\|');

  const { site_summary: summary } = result;
  const lines = [
    '# Crawlith SEO Analysis Report',
    '',
    '## 📊 Summary',
    `- Pages Analyzed: ${summary.pages_analyzed}`,
    `- Overall Site Score: ${summary.site_score.toFixed(1)}`,
    `- Avg SEO Score: ${summary.avg_seo_score.toFixed(1)}`,
    `- Thin Pages Found: ${summary.thin_pages}`,
    `- Duplicate Titles: ${summary.duplicate_titles}`,
    '',
    '## 📄 Page Details',
    '',
    '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
    '| :--- | :--- | :--- | :--- | :--- |',
  ];

  for (const page of result.pages) {
    const cells = [page.url, page.seoScore, page.thinScore, page.title.status, page.metaDescription.status];
    lines.push(`| ${cells.map(cell).join(' | ')} |`);
  }

  return lines.join('\n');
}
|
|
323
|
-
|
|
324
|
-
/**
 * Renders an analysis result as CSV: one header line, then one line per page.
 *
 * Title and meta description are always quoted, with embedded quotes
 * doubled. The URL is quoted only when it contains a comma, a quote or a
 * line break, so ordinary URLs are emitted exactly as before while unusual
 * ones no longer shift the columns. A status of 0 is shown as `Pending/Limit`.
 *
 * @param result Analysis result to render.
 * @returns CSV text with lines joined by `\n`.
 */
export function renderAnalysisCsv(result: AnalysisResult): string {
  const quote = (text: string): string => `"${text.replace(/"/g, '""')}"`;
  const quoteIfNeeded = (text: string): string => (/[",\r\n]/.test(text) ? quote(text) : text);

  const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];

  const rows = result.pages.map((p) => {
    const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
    return [
      quoteIfNeeded(p.url),
      p.seoScore,
      p.thinScore,
      statusStr,
      quote(p.title.value || ''),
      p.title.length,
      quote(p.metaDescription.value || ''),
      p.metaDescription.length,
      p.content.wordCount,
      p.links.internalLinks,
      p.links.externalLinks
    ].join(',');
  });

  return [headers.join(','), ...rows].join('\n');
}
|
|
345
|
-
|
|
346
|
-
/**
 * Escapes text for safe inclusion in HTML element content or a quoted
 * attribute value.
 *
 * `&`, `<`, `>`, `"` and `'` are replaced by their entities in a single
 * pass, so an ampersand introduced by one replacement is never escaped twice.
 *
 * @param value Raw text.
 * @returns The escaped text.
 */
function escapeHtml(value: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return value.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
|
|
349
|
-
|
|
350
|
-
/**
 * Builds the per-page analysis for every crawled page.
 *
 * Titles and meta descriptions are analysed for all pages first so that
 * duplicate statuses can be applied across the whole set. A page gets a
 * duplication score of 100 when at least one other page has the same
 * unique-sentence count, otherwise 0; that score feeds the thin-content score.
 *
 * @param rootUrl Root URL passed to the link analysis.
 * @param pages Crawled pages; missing HTML is analysed as an empty string.
 * @returns One analysis per input page, in input order.
 */
function analyzePages(rootUrl: string, pages: CrawlPage[]): PageAnalysis[] {
  const rawTitles = pages.map((page) => analyzeTitle(page.html || ''));
  const rawMetas = pages.map((page) => analyzeMetaDescription(page.html || ''));
  const titles = applyDuplicateStatuses(rawTitles);
  const metas = applyDuplicateStatuses(rawMetas);

  const contents = pages.map((page) => analyzeContent(page.html || ''));

  // How many pages share each unique-sentence count.
  const pagesPerSentenceCount = new Map<number, number>();
  contents.forEach(({ uniqueSentenceCount }) => {
    const seen = pagesPerSentenceCount.get(uniqueSentenceCount) || 0;
    pagesPerSentenceCount.set(uniqueSentenceCount, seen + 1);
  });

  return pages.map((page, index) => {
    const html = page.html || '';
    const title = titles[index];
    const content = contents[index];
    const sharedCount = pagesPerSentenceCount.get(content.uniqueSentenceCount) || 0;
    const duplicationScore = sharedCount > 1 ? 100 : 0;

    const analysis: PageAnalysis = {
      url: page.url,
      status: page.status || 0,
      title,
      metaDescription: metas[index],
      h1: analyzeH1(html, title.value),
      content,
      thinScore: calculateThinContentScore(content, duplicationScore),
      images: analyzeImageAlts(html),
      links: analyzeLinks(html, page.url, rootUrl),
      structuredData: analyzeStructuredData(html),
      // Placeholder: the scorer receives the record while this is still 0.
      seoScore: 0,
      meta: {
        canonical: page.canonical,
        noindex: page.noindex,
        nofollow: page.nofollow
      }
    };

    analysis.seoScore = scorePageSeo(analysis);
    return analysis;
  });
}
|
|
397
|
-
|
|
398
|
-
/**
 * Returns a copy of `page` in which the results of disabled modules are
 * replaced by neutral placeholder values.
 *
 * - `seo` controls title, meta description, links and structured data.
 * - `content` controls content statistics and the thin-content score.
 * - `accessibility` controls image alt-text statistics.
 * - The H1 analysis is kept when either `seo` or `content` is enabled.
 *
 * All other fields (including `seoScore`) are copied through unchanged.
 *
 * @param page - Full analysis of a single page.
 * @param modules - Flags telling which modules' results to keep.
 * @returns A new `PageAnalysis`; the input object is not modified.
 */
function filterPageModules(
  page: PageAnalysis,
  modules: { seo: boolean; content: boolean; accessibility: boolean }
): PageAnalysis {
  const { seo, content: wantsContent, accessibility } = modules;
  const wantsHeading = seo || wantsContent;

  // Factory rather than a shared constant so title and meta description never
  // alias the same placeholder object.
  const missingText = () => ({ value: null, length: 0, status: 'missing' as const });

  const title = seo ? page.title : missingText();
  const metaDescription = seo ? page.metaDescription : missingText();
  const h1 = wantsHeading
    ? page.h1
    : { count: 0, status: 'critical' as const, matchesTitle: false };
  const links = seo
    ? page.links
    : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 };
  const structuredData = seo
    ? page.structuredData
    : { present: false, valid: false, types: [] };
  const content = wantsContent
    ? page.content
    : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 };
  const thinScore = wantsContent ? page.thinScore : 0;
  const images = accessibility
    ? page.images
    : { totalImages: 0, missingAlt: 0, emptyAlt: 0 };

  return {
    ...page,
    title,
    metaDescription,
    h1,
    links,
    structuredData,
    content,
    thinScore,
    images
  };
}
|
|
418
|
-
|
|
419
|
-
/**
 * Loads the crawl data (pages, link graph and graph metrics) used to analyse
 * `rootUrl`.
 *
 * When `fromCrawl` is given, that file is read as a JSON crawl export first.
 * If it cannot be read, parsed or converted, loading falls back to the latest
 * completed snapshot stored in the database for the site's domain.
 *
 * @param rootUrl - Root URL of the site; its hostname (minus a leading `www.`)
 *   identifies the site in the database.
 * @param fromCrawl - Optional path to a JSON crawl export.
 * @returns The pages, the link graph and the metrics computed from the graph.
 * @throws Error when the database has no completed snapshot for the site.
 *   `new URL(rootUrl)` also throws when `rootUrl` is not a valid URL.
 */
async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<CrawlData> {
  if (fromCrawl) {
    try {
      const content = await fs.readFile(fromCrawl, 'utf-8');
      const raw = JSON.parse(content) as Record<string, unknown>;
      const pages = parsePages(raw);
      const graph = graphFromPages(rootUrl, pages, raw);
      const metrics = calculateMetrics(graph, 5);
      return { pages, metrics, graph };
    } catch (_e) {
      // Deliberate best effort: any failure with the export file (missing,
      // unreadable, malformed) falls through to the database lookup below.
    }
  }

  const db = getDb();
  const siteRepo = new SiteRepository(db);
  const snapshotRepo = new SnapshotRepository(db);
  const pageRepo = new PageRepository(db);

  const urlObj = new URL(rootUrl);
  // Strip only a leading "www." label. A plain string replace would remove the
  // first "www." found anywhere in the host (e.g. "awww.io" -> "aio").
  // NOTE(review): whatever code stores sites must normalise hosts the same
  // way -- confirm against the crawler's site lookup.
  const domain = urlObj.hostname.replace(/^www\./, '');
  const site = siteRepo.firstOrCreateSite(domain);

  const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
  if (!snapshot) {
    throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
  }

  const graph = loadGraphFromSnapshot(snapshot.id);
  const metrics = calculateMetrics(graph, 5);

  // Per the original author's note, graph nodes loaded from a snapshot do not
  // carry raw HTML (to save memory), while the analysis needs it. The page
  // list is therefore built from the page repository rather than the graph.
  const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
  const pages: CrawlPage[] = dbPages.map((p: any) => ({
    url: p.normalized_url,
    status: p.http_status || 0,
    html: p.html || '',
    depth: p.depth || 0
  }));

  return { pages, metrics, graph };
}
|
|
469
|
-
|
|
470
|
-
/**
 * Extracts the page list from a parsed JSON crawl export.
 *
 * The export may list pages under `pages` or under `nodes`; `pages` wins when
 * it is an array (even an empty one). Each entry is normalised to a
 * `CrawlPage`:
 *
 * - entries that are not objects, or whose `url` is not a non-empty string,
 *   are dropped;
 * - `status` and `depth` are coerced to numbers, falling back to `0` when the
 *   value is missing or not a finite number;
 * - `html` is kept only when it is a string, otherwise it becomes `''`.
 *
 * @param raw - Parsed export object.
 * @returns The normalised pages, or an empty array when neither key holds an array.
 */
function parsePages(raw: Record<string, unknown>): CrawlPage[] {
  // Coerces JSON values such as 200 or "200" to a number; anything that does
  // not yield a finite number (missing, "abc", objects) becomes 0.
  const toFiniteNumber = (value: unknown): number => {
    const parsed = Number(value ?? 0);
    return Number.isFinite(parsed) ? parsed : 0;
  };

  // Converts one export entry to a CrawlPage, or undefined when it is unusable.
  const toPage = (entry: unknown): CrawlPage | undefined => {
    if (typeof entry !== 'object' || entry === null) return undefined;
    const record = entry as Record<string, unknown>;
    if (typeof record.url !== 'string' || record.url === '') return undefined;
    return {
      url: record.url,
      status: toFiniteNumber(record.status),
      html: typeof record.html === 'string' ? record.html : '',
      depth: toFiniteNumber(record.depth)
    };
  };

  let entries: unknown[] = [];
  if (Array.isArray(raw.pages)) {
    entries = raw.pages;
  } else if (Array.isArray(raw.nodes)) {
    entries = raw.nodes;
  }

  const pages: CrawlPage[] = [];
  for (const entry of entries) {
    const page = toPage(entry);
    if (page) pages.push(page);
  }
  return pages;
}
|
|
497
|
-
|
|
498
|
-
/**
 * Builds the link graph for a JSON crawl export.
 *
 * Every page becomes a node carrying its depth and status. Edges come only
 * from `raw.edges`: each entry with string `source` and `target` adds both
 * endpoints as nodes (depth 0, status 0) and an edge between them. Other
 * entries are ignored. Page HTML is not parsed here, so an export without an
 * `edges` array yields a graph with nodes only.
 *
 * @param rootUrl - Root URL of the site. Not needed to build the graph; kept
 *   so existing callers keep working.
 * @param pages - Pages to add as nodes.
 * @param raw - Parsed export object; only its `edges` property is read.
 * @returns The populated graph.
 */
function graphFromPages(rootUrl: string, pages: CrawlPage[], raw: Record<string, unknown>): Graph {
  // Referenced only so the parameter stays "used" for unused-parameter checks.
  void rootUrl;

  const graph = new Graph();

  for (const page of pages) {
    graph.addNode(page.url, page.depth || 0, page.status || 0);
  }

  if (!Array.isArray(raw.edges)) {
    return graph;
  }

  for (const edge of raw.edges) {
    // Skip null / primitive entries instead of failing on property access.
    if (typeof edge !== 'object' || edge === null) continue;
    const { source, target } = edge as Record<string, unknown>;
    if (typeof source !== 'string' || typeof target !== 'string') continue;

    // NOTE(review): endpoints that are also pages are re-added here with
    // depth 0 / status 0 -- confirm Graph.addNode does not overwrite the
    // values recorded from `pages`.
    graph.addNode(source, 0, 0);
    graph.addNode(target, 0, 0);
    graph.addEdge(source, target);
  }

  return graph;
}
|
|
525
|
-
|
|
526
|
-
/**
 * Crawls a single URL live (limit 1, depth 0) and returns its crawl data.
 *
 * The crawl is stored as a snapshot; the graph is then loaded back from that
 * snapshot and one page entry is produced per graph node.
 *
 * @param url - URL to crawl.
 * @param options - Analysis options; only the network-related settings
 *   (`rate`, `proxyUrl`, `userAgent`, `maxRedirects`, `debug`) are forwarded.
 * @returns The crawled pages, the graph and the metrics computed from it.
 */
async function runLiveCrawl(url: string, options: AnalyzeOptions): Promise<CrawlData> {
  const snapshotId = await crawl(url, {
    limit: 1,
    depth: 0,
    rate: options.rate,
    proxyUrl: options.proxyUrl,
    userAgent: options.userAgent,
    maxRedirects: options.maxRedirects,
    debug: options.debug
  });

  const graph = loadGraphFromSnapshot(snapshotId);
  const nodes = graph.getNodes();

  // Nodes loaded from a snapshot may come back without raw HTML, while the
  // analysis needs it. When any node lacks HTML, read the stored pages of this
  // snapshot and use their HTML as a fallback.
  // NOTE(review): assumes node.url matches the stored `normalized_url` --
  // confirm against loadGraphFromSnapshot.
  const storedHtml = new Map<string, string>();
  if (nodes.some((node) => !node.html)) {
    const pageRepo = new PageRepository(getDb());
    pageRepo.getPagesBySnapshot(snapshotId).forEach((row: any) => {
      if (typeof row.html === 'string' && row.html !== '') {
        storedHtml.set(row.normalized_url, row.html);
      }
    });
  }

  const pages = nodes.map((node) => ({
    url: node.url,
    status: node.status,
    html: node.html || storedHtml.get(node.url) || '',
    depth: node.depth
  }));

  return {
    pages,
    metrics: calculateMetrics(graph, 1),
    graph
  };
}
|