@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { Graph, GraphNode } from '../graph/graph.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Settings accepted by computeHITS.
 */
export interface HITSOptions {
  /**
   * How many authority/hub update rounds to run.
   * computeHITS uses 20 when this is omitted or falsy (0 included).
   */
  iterations?: number;
}
|
|
6
|
+
|
|
7
|
+
/**
 * Runs the HITS (hub / authority) iteration over the graph and writes the
 * results onto the participating nodes.
 *
 * A node participates when its status is 200, its redirectChain is absent or
 * empty, and it is not flagged noindex. An edge is used only when both of its
 * endpoints participate and it is not a self-link. Participating nodes receive
 * `authorityScore` and `hubScore` (L2-normalised after every round) and are
 * then handed to classifyLinkRoles. When no node participates the function
 * returns without touching anything.
 *
 * @param graph   Graph exposing getNodes() and getEdges().
 * @param options Optional settings; `iterations` falls back to 20 when missing or falsy.
 */
export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
  const rounds = options.iterations || 20;

  // 1. Select the participating nodes.
  const participants = graph.getNodes().filter((candidate) => {
    if (candidate.status !== 200) return false;
    if (candidate.redirectChain && candidate.redirectChain.length !== 0) return false;
    return !candidate.noindex;
  });

  if (participants.length === 0) return;

  // 2. Index participants by URL and seed both scores at 1.0.
  const byUrl = new Map<string, GraphNode>();
  participants.forEach((node) => {
    byUrl.set(node.url, node);
    node.authorityScore = 1.0;
    node.hubScore = 1.0;
  });

  // Adjacency lists limited to participant-to-participant, non-self edges.
  const inbound = new Map<string, { source: string, weight: number }[]>();
  const outbound = new Map<string, { target: string, weight: number }[]>();

  for (const { source, target, weight } of graph.getEdges()) {
    if (source === target || !byUrl.has(source) || !byUrl.has(target)) continue;

    let intoTarget = inbound.get(target);
    if (!intoTarget) {
      intoTarget = [];
      inbound.set(target, intoTarget);
    }
    intoTarget.push({ source, weight });

    let outOfSource = outbound.get(source);
    if (!outOfSource) {
      outOfSource = [];
      outbound.set(source, outOfSource);
    }
    outOfSource.push({ target, weight });
  }

  // 3. Alternate authority and hub updates, normalising after each.
  for (let round = 0; round < rounds; round++) {
    // Authority = weighted sum of the hub scores of the pages linking in.
    let authSquares = 0;
    for (const node of participants) {
      const total = (inbound.get(node.url) || []).reduce(
        (sum, link) => sum + (byUrl.get(link.source)!.hubScore || 0) * link.weight,
        0
      );
      node.authorityScore = total;
      authSquares += total * total;
    }

    // Skip the division when every authority is zero (nothing to scale).
    const authNorm = Math.sqrt(authSquares);
    if (authNorm > 0) {
      for (const node of participants) {
        node.authorityScore = (node.authorityScore || 0) / authNorm;
      }
    }

    // Hub = weighted sum of the (already normalised) authority scores linked to.
    let hubSquares = 0;
    for (const node of participants) {
      const total = (outbound.get(node.url) || []).reduce(
        (sum, link) => sum + (byUrl.get(link.target)!.authorityScore || 0) * link.weight,
        0
      );
      node.hubScore = total;
      hubSquares += total * total;
    }

    const hubNorm = Math.sqrt(hubSquares);
    if (hubNorm > 0) {
      for (const node of participants) {
        node.hubScore = (node.hubScore || 0) / hubNorm;
      }
    }
  }

  // 4. Derive a link role from the final scores.
  classifyLinkRoles(participants);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Assigns a `linkRole` to every node from its authority and hub scores.
 *
 * A score is "high" when it is strictly greater than the upper median of all
 * scores of the same kind (the element at index floor(n / 2) of the ascending
 * list) and also greater than a small floor, so near-zero scores never count.
 *
 * Roles:
 * - 'power':      high authority and high hub
 * - 'authority':  high authority only
 * - 'hub':        high hub only
 * - 'balanced':   neither is high, but both are above the floor
 * - 'peripheral': everything else
 *
 * Missing scores are treated as 0. An empty list is a no-op.
 *
 * @param nodes Nodes to label; each node's `linkRole` is overwritten in place.
 */
function classifyLinkRoles(nodes: GraphNode[]): void {
  if (nodes.length === 0) return;

  // Scores at or below this value are treated as insignificant.
  const MIN_SIGNIFICANT_SCORE = 0.0001;

  // Upper median: for an even count this is the larger of the two middle values.
  const upperMedian = (values: number[]): number => {
    const ascending = [...values].sort((a, b) => a - b);
    return ascending[Math.floor(ascending.length / 2)];
  };

  // The "high" threshold is the median, not a higher percentile.
  const medianAuth = upperMedian(nodes.map(n => n.authorityScore || 0));
  const medianHub = upperMedian(nodes.map(n => n.hubScore || 0));

  for (const node of nodes) {
    const auth = node.authorityScore || 0;
    const hub = node.hubScore || 0;

    const isHighAuth = auth > medianAuth && auth > MIN_SIGNIFICANT_SCORE;
    const isHighHub = hub > medianHub && hub > MIN_SIGNIFICANT_SCORE;

    if (isHighAuth && isHighHub) {
      node.linkRole = 'power';
    } else if (isHighAuth) {
      node.linkRole = 'authority';
    } else if (isHighHub) {
      node.linkRole = 'hub';
    } else if (auth > MIN_SIGNIFICANT_SCORE && hub > MIN_SIGNIFICANT_SCORE) {
      node.linkRole = 'balanced';
    } else {
      node.linkRole = 'peripheral';
    }
  }
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
 * Orphan category assigned by annotateOrphans:
 * - 'hard':       no consolidated inbound links and not discovered via sitemap
 * - 'crawl-only': no consolidated inbound links, but discovered via sitemap
 * - 'near':       at least one but no more than `minInbound` consolidated inbound links
 * - 'soft':       linked, but every known linking page is a low-value page
 */
export type OrphanType =
  | 'hard'
  | 'near'
  | 'soft'
  | 'crawl-only';

/**
 * Severity band produced by mapImpactLevel:
 * up to 39 is 'low', up to 69 'medium', up to 89 'high', anything above 'critical'.
 */
export type ImpactLevel =
  | 'low'
  | 'medium'
  | 'high'
  | 'critical';

/** A page as consumed by the orphan detection and scoring functions. */
export interface SitegraphNode {
  /** Page URL; used as the lookup key for nodes and as the edge endpoint value. */
  url: string;
  /** Crawl depth; 0 is treated as the homepage when no rootUrl option is given. */
  depth: number;
  /** Inbound link count; summed per canonical URL before orphan classification. */
  inLinks: number;
  outLinks: number;
  status: number;
  /** Distinguishes 'crawl-only' from 'hard' when there are no inbound links. */
  discoveredViaSitemap?: boolean;
  /** When true the page is never reported as an orphan. */
  robotsExcluded?: boolean;
  /** When set, inbound links are pooled under this URL instead of `url`. */
  canonicalUrl?: string;
  /** When true the page is never reported as an orphan. */
  isHomepage?: boolean;
  wordCount?: number;
  hasStructuredData?: boolean;
  /** Compared case-insensitively against values such as 'pagination' or 'archive'. */
  pageType?: string;
  noindex?: boolean;
  duplicateContent?: boolean;
  isProductOrCommercial?: boolean;
}

/** A directed link between two pages, identified by their URLs. */
export interface SitegraphEdge {
  source: string;
  target: string;
}

/** Switches controlling annotateOrphans. */
export interface OrphanScoringOptions {
  /** When false, every node is returned with `orphan: false`. */
  enabled: boolean;
  /** When false, orphans are reported without orphanSeverity / impactLevel. */
  severityEnabled: boolean;
  /** Enables detection of 'soft' orphans. */
  includeSoftOrphans: boolean;
  /** Largest non-zero consolidated inbound count still classified as 'near'. */
  minInbound: number;
  /** When set, the homepage is the node whose url equals this value; otherwise depth 0 is used. */
  rootUrl?: string;
}

/** A SitegraphNode plus the orphan verdict computed for it. */
export type AnnotatedNode = SitegraphNode & {
  orphan: boolean;
  orphanType?: OrphanType;
  orphanSeverity?: number;
  impactLevel?: ImpactLevel;
};
|
|
41
|
+
|
|
42
|
+
// URL shapes that mark a page as low-value for soft-orphan detection.
const LOW_VALUE_PATTERNS = [
  // Numeric pagination parameter: ?page=2, &p=3
  /[?&](page|p)=\d+/i,
  // Listing path segments: /page/, /tag/, /tags/, /category/, /categories/
  /\/(page|tag|tags|category|categories)\//i,
  // Search / filter / sort query parameters
  /[?&](q|query|search|filter|sort)=/i,
  // A /search path segment followed by "/", "?" or the end of the URL
  /\/search(\/|\?|$)/i
];
|
|
48
|
+
|
|
49
|
+
/**
 * Tells whether a page is low-value as a link source.
 *
 * True when the page type (case-insensitive) is one of the listing/search
 * types below, when the page is noindex, or when its URL matches any entry of
 * LOW_VALUE_PATTERNS.
 */
function isLowValuePage(node: SitegraphNode): boolean {
  const lowValueTypes = ['pagination', 'tag', 'category', 'filter', 'search', 'archive'];
  const normalizedType = (node.pageType || '').toLowerCase();

  return (
    lowValueTypes.includes(normalizedType) ||
    Boolean(node.noindex) ||
    LOW_VALUE_PATTERNS.some((pattern) => pattern.test(node.url))
  );
}
|
|
59
|
+
|
|
60
|
+
/**
 * Rounds a score to the nearest integer and confines it to the 0..100 range.
 */
function clampScore(score: number): number {
  const rounded = Math.round(score);
  // "<=" rather than "<" so that a rounded -0 comes back as +0.
  if (rounded <= 0) return 0;
  if (rounded >= 100) return 100;
  return rounded;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Converts a severity score into its impact band.
 *
 * @param score Severity score.
 * @returns 'low' up to 39, 'medium' up to 69, 'high' up to 89, otherwise 'critical'.
 */
export function mapImpactLevel(score: number): ImpactLevel {
  // Inclusive upper bounds, checked in ascending order.
  const bands: ReadonlyArray<readonly [number, ImpactLevel]> = [
    [39, 'low'],
    [69, 'medium'],
    [89, 'high'],
  ];

  for (const [ceiling, level] of bands) {
    if (score <= ceiling) return level;
  }
  return 'critical';
}
|
|
70
|
+
|
|
71
|
+
/**
 * Scores how severe an orphan is, on a 0..100 scale.
 *
 * Base score by type: hard 90, crawl-only 80, near 70 (at most one inbound
 * link) or 60, soft 50. Each value signal (more than 800 words, structured
 * data, depth of 2 or less, product/commercial page) adds 10, capped at +20.
 * Any weakness signal (1..299 words, noindex, duplicate content, archive or
 * pagination page type) subtracts a flat 20. The result is rounded and clamped
 * by clampScore.
 *
 * @param orphanType Category of the orphan.
 * @param node       Page being scored; `inLinks` is only consulted for 'near'.
 * @returns Integer severity between 0 and 100.
 */
export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
  const words = node.wordCount || 0;
  const kind = (node.pageType || '').toLowerCase();

  let base = 0;
  if (orphanType === 'hard') {
    base = 90;
  } else if (orphanType === 'crawl-only') {
    base = 80;
  } else if (orphanType === 'near') {
    base = node.inLinks <= 1 ? 70 : 60;
  } else if (orphanType === 'soft') {
    base = 50;
  }

  // Signals that the page is valuable, making its orphan status worse.
  const valueSignals = [
    words > 800,
    Boolean(node.hasStructuredData),
    node.depth <= 2,
    Boolean(node.isProductOrCommercial),
  ];
  const bonus = Math.min(20, 10 * valueSignals.filter(Boolean).length);

  // Signals that the page matters little; one is enough for the full penalty.
  const weaknessSignals = [
    words > 0 && words < 300,
    Boolean(node.noindex),
    Boolean(node.duplicateContent),
    kind === 'archive' || kind === 'pagination',
  ];
  const penalty = weaknessSignals.some(Boolean) ? 20 : 0;

  return clampScore(base + bonus - penalty);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Sums inbound link counts per canonical URL.
 *
 * A node contributes its `inLinks` to its `canonicalUrl` when that is set and
 * non-empty, otherwise to its own `url`.
 *
 * @returns Map from canonical URL to the pooled inbound link count.
 */
function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
  return nodes.reduce((totals, { url, canonicalUrl, inLinks }) => {
    const key = canonicalUrl || url;
    const runningTotal = totals.get(key) || 0;
    return totals.set(key, runningTotal + inLinks);
  }, new Map<string, number>());
}
|
|
117
|
+
|
|
118
|
+
/**
 * Annotates every node with an orphan verdict and, optionally, a severity.
 *
 * Rules, applied per node in input order:
 * - With `options.enabled` false, every node is returned with `orphan: false`.
 * - The homepage (isHomepage flag, or url equal to rootUrl when given, or
 *   depth 0 otherwise) and robots-excluded pages are never orphans.
 * - Inbound links are pooled per canonical URL. A pooled count of 0 yields
 *   'crawl-only' (discovered via sitemap) or 'hard'; a count of at most
 *   `minInbound` yields 'near'.
 * - Otherwise, with `includeSoftOrphans` on, a page is 'soft' when at least one
 *   edge targets its url from a known node and every such source is low-value.
 * - With `severityEnabled` on, orphans also get orphanSeverity and impactLevel.
 *
 * @param nodes   Pages to annotate; input objects are copied, not mutated.
 * @param edges   Links between pages, consulted only for soft-orphan detection.
 * @param options Feature switches and thresholds.
 * @returns One annotated copy per input node, in the same order.
 */
export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
  if (!options.enabled) {
    return nodes.map((node) => ({ ...node, orphan: false }));
  }

  const canonicalInbound = consolidateInboundByCanonical(nodes);

  // Index known link sources by target URL once, instead of re-scanning the
  // whole edge list for every candidate node. Only soft-orphan detection reads it.
  const inboundSourcesByTarget = new Map<string, SitegraphNode[]>();
  if (options.includeSoftOrphans) {
    const nodeByUrl = new Map<string, SitegraphNode>(
      nodes.map((node): [string, SitegraphNode] => [node.url, node])
    );
    for (const edge of edges) {
      const source = nodeByUrl.get(edge.source);
      if (!source) continue; // edges from pages outside the node list are ignored
      const bucket = inboundSourcesByTarget.get(edge.target);
      if (bucket) {
        bucket.push(source);
      } else {
        inboundSourcesByTarget.set(edge.target, [source]);
      }
    }
  }

  return nodes.map((node) => {
    const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
    if (isHomepage || node.robotsExcluded) {
      return { ...node, orphan: false };
    }

    const canonical = node.canonicalUrl || node.url;
    const inbound = canonicalInbound.get(canonical) || 0;

    let orphanType: OrphanType | undefined;

    if (inbound === 0) {
      orphanType = node.discoveredViaSitemap ? 'crawl-only' : 'hard';
    } else if (inbound <= options.minInbound) {
      orphanType = 'near';
    }

    if (!orphanType && options.includeSoftOrphans && inbound > 0) {
      const inboundSources = inboundSourcesByTarget.get(node.url) || [];
      if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
        orphanType = 'soft';
      }
    }

    if (!orphanType) {
      return { ...node, orphan: false };
    }

    if (!options.severityEnabled) {
      return {
        ...node,
        orphan: true,
        orphanType
      };
    }

    // Severity is computed from the pooled inbound count, not the node's own inLinks.
    const orphanSeverity = calculateOrphanSeverity(orphanType, { ...node, inLinks: inbound });

    return {
      ...node,
      orphan: true,
      orphanType,
      orphanSeverity,
      impactLevel: mapImpactLevel(orphanSeverity)
    };
  });
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { fileURLToPath } from 'node:url';
|
|
3
|
+
import { dirname, join } from 'node:path';
|
|
4
|
+
|
|
5
|
+
// Version reported when package.json cannot be read or has no usable version.
const FALLBACK_VERSION = '0.0.1';

/**
 * Reads the `version` field from the package.json located two directories
 * above this module.
 *
 * Best-effort by design: a missing or unreadable file, invalid JSON, or a
 * version that is not a non-empty string all yield FALLBACK_VERSION, so the
 * exported value is always a string.
 */
function readPackageVersion() {
  try {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const pkgPath = join(moduleDir, '../../package.json');
    const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
    const declared = pkg?.version;
    return typeof declared === 'string' && declared.length > 0 ? declared : FALLBACK_VERSION;
  } catch {
    // Deliberate fallback: the version lookup must never break module loading.
    return FALLBACK_VERSION;
  }
}

const version = readPackageVersion();

export { version };
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
|
2
|
+
|
|
3
|
+
exports[`orphan detection and severity scoring > canonical consolidation, robots exclusion, and deterministic JSON output snapshot 1`] = `
|
|
4
|
+
"[
|
|
5
|
+
{
|
|
6
|
+
"url": "https://example.com/canonical",
|
|
7
|
+
"depth": 1,
|
|
8
|
+
"inLinks": 0,
|
|
9
|
+
"outLinks": 0,
|
|
10
|
+
"status": 200,
|
|
11
|
+
"orphan": true,
|
|
12
|
+
"orphanType": "near",
|
|
13
|
+
"orphanSeverity": 80,
|
|
14
|
+
"impactLevel": "high"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"url": "https://example.com/variant?a=1",
|
|
18
|
+
"depth": 1,
|
|
19
|
+
"inLinks": 1,
|
|
20
|
+
"outLinks": 0,
|
|
21
|
+
"status": 200,
|
|
22
|
+
"canonicalUrl": "https://example.com/canonical",
|
|
23
|
+
"orphan": true,
|
|
24
|
+
"orphanType": "near",
|
|
25
|
+
"orphanSeverity": 80,
|
|
26
|
+
"impactLevel": "high"
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"url": "https://example.com/blocked",
|
|
30
|
+
"depth": 1,
|
|
31
|
+
"inLinks": 0,
|
|
32
|
+
"outLinks": 0,
|
|
33
|
+
"status": 200,
|
|
34
|
+
"robotsExcluded": true,
|
|
35
|
+
"orphan": false
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"url": "https://example.com/redirect-target",
|
|
39
|
+
"depth": 1,
|
|
40
|
+
"inLinks": 1,
|
|
41
|
+
"outLinks": 0,
|
|
42
|
+
"status": 200,
|
|
43
|
+
"orphan": true,
|
|
44
|
+
"orphanType": "near",
|
|
45
|
+
"orphanSeverity": 80,
|
|
46
|
+
"impactLevel": "high"
|
|
47
|
+
}
|
|
48
|
+
]"
|
|
49
|
+
`;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest';
|
|
2
|
+
import { analyzeTitle, analyzeMetaDescription, applyDuplicateStatuses, analyzeH1 } from '../src/analysis/seo.js';
|
|
3
|
+
import { analyzeContent, calculateThinContentScore } from '../src/analysis/content.js';
|
|
4
|
+
import { analyzeStructuredData } from '../src/analysis/structuredData.js';
|
|
5
|
+
import { analyzeLinks } from '../src/analysis/links.js';
|
|
6
|
+
import { analyzeImageAlts } from '../src/analysis/images.js';
|
|
7
|
+
|
|
8
|
+
// Unit checks for the title, meta description, duplicate and H1 analyzers.
describe('SEO module', () => {
  test('analyze title edge cases', () => {
    const statusOf = (html: string) => analyzeTitle(html).status;

    expect(statusOf('<html></html>')).toBe('missing');
    expect(statusOf('<title>short</title>')).toBe('too_short');
    expect(statusOf(`<title>${'a'.repeat(61)}</title>`)).toBe('too_long');
    expect(statusOf(`<title>${'a'.repeat(55)}</title>`)).toBe('ok');
  });

  test('duplicate detection', () => {
    // The first two values differ only by case; the third has no value.
    const annotated = applyDuplicateStatuses([
      { value: 'Same', length: 4, status: 'ok' as const },
      { value: 'same', length: 4, status: 'ok' as const },
      { value: null, length: 0, status: 'missing' as const }
    ]);

    expect(annotated[0].status).toBe('duplicate');
    expect(annotated[1].status).toBe('duplicate');
    expect(annotated[2].status).toBe('missing');
  });

  test('meta description boundaries', () => {
    const statusOf = (html: string) => analyzeMetaDescription(html).status;

    expect(statusOf('<meta name="description" content="">')).toBe('missing');
    expect(statusOf('<html></html>')).toBe('missing');
    expect(statusOf('<meta name="description" content="short">')).toBe('too_short');
    expect(statusOf(`<meta name="description" content="${'x'.repeat(150)}">`)).toBe('ok');
    expect(statusOf(`<meta name="description" content="${'x'.repeat(170)}">`)).toBe('too_long');
  });

  test('h1 variations', () => {
    const pageTitle = 'Title';

    expect(analyzeH1('<h1>One</h1>', pageTitle).status).toBe('ok');
    expect(analyzeH1('<h1>One</h1><h1>Two</h1>', pageTitle).status).toBe('warning');
    expect(analyzeH1('<p>none</p>', pageTitle).status).toBe('critical');

    // H1 and title differ only by case.
    expect(analyzeH1('<h1>same</h1>', 'Same').matchesTitle).toBe(true);
  });
});
|
|
44
|
+
|
|
45
|
+
// Unit checks for content extraction and the thin-content score.
describe('content module', () => {
  test('word count strips nav/footer/script/style', () => {
    const page = '<body><nav>skip me</nav><p>keep words here</p><footer>skip</footer><script>var x</script><style>.x{}</style></body>';
    const { wordCount, uniqueSentenceCount, textHtmlRatio } = analyzeContent(page);

    expect(wordCount).toBe(3);
    expect(uniqueSentenceCount).toBe(1);
    expect(textHtmlRatio).toBeGreaterThan(0);
  });

  test('thin score boundaries', () => {
    const richContent = { wordCount: 600, textHtmlRatio: 0.5, uniqueSentenceCount: 4 };
    const emptyContent = { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 1 };

    expect(calculateThinContentScore(richContent, 0)).toBe(0);
    expect(calculateThinContentScore(emptyContent, 100)).toBe(100);
  });

  test('content handles malformed/empty html', () => {
    const emptyResult = analyzeContent('');
    expect(emptyResult.wordCount).toBe(0);

    const unclosedResult = analyzeContent('<div><span>broken');
    expect(unclosedResult.wordCount).toBeGreaterThanOrEqual(1);
  });
});
|
|
64
|
+
|
|
65
|
+
// Unit checks for JSON-LD detection and validation.
describe('structured data', () => {
  test('valid and invalid JSON-LD parsing', () => {
    const wellFormed = analyzeStructuredData('<script type="application/ld+json">{"@type":"Article"}</script>');
    expect(wellFormed.present).toBe(true);
    expect(wellFormed.valid).toBe(true);
    expect(wellFormed.types).toContain('Article');

    // A script tag is there, but its payload is not parseable JSON.
    const malformed = analyzeStructuredData('<script type="application/ld+json">{invalid}</script>');
    expect(malformed.present).toBe(true);
    expect(malformed.valid).toBe(false);

    const absent = analyzeStructuredData('<p>none</p>');
    expect(absent.present).toBe(false);
  });
});
|
|
80
|
+
|
|
81
|
+
// Unit checks for link classification and image alt-text counting.
describe('links and images', () => {
  test('link ratio calculation', () => {
    // One relative link, two links to other.com (one of them nofollow).
    const markup = '<a href="/a">A</a><a href="https://other.com">B</a><a href="https://other.com" rel="nofollow">C</a>';
    const { internalLinks, externalLinks, nofollowCount, externalRatio } =
      analyzeLinks(markup, 'https://example.com/page', 'https://example.com');

    expect(internalLinks).toBe(1);
    expect(externalLinks).toBe(2);
    expect(nofollowCount).toBe(1);
    expect(externalRatio).toBeCloseTo(2 / 3);
  });

  test('image alt detection', () => {
    // No alt attribute, empty alt attribute, and a filled alt attribute.
    const markup = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
    const { totalImages, missingAlt, emptyAlt } = analyzeImageAlts(markup);

    expect(totalImages).toBe(3);
    expect(missingAlt).toBe(1);
    expect(emptyAlt).toBe(1);
  });
});
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fs from 'node:fs/promises';
|
|
4
|
+
import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
|
|
5
|
+
|
|
6
|
+
describe('analyze integration', () => {
|
|
7
|
+
const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');
|
|
8
|
+
|
|
9
|
+
test('analyzes full crawl fixture and schema', async () => {
|
|
10
|
+
const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
|
|
11
|
+
|
|
12
|
+
expect(result.site_summary.pages_analyzed).toBe(3);
|
|
13
|
+
expect(result.site_summary.duplicate_titles).toBe(2);
|
|
14
|
+
expect(result.site_summary.avg_seo_score).toBeGreaterThanOrEqual(0);
|
|
15
|
+
expect(result.pages[0]).toHaveProperty('title');
|
|
16
|
+
expect(result.pages[0]).toHaveProperty('content');
|
|
17
|
+
expect(result.pages[0]).toHaveProperty('links');
|
|
18
|
+
expect(result.site_scores.overallScore).toBeGreaterThanOrEqual(0);
|
|
19
|
+
expect(result.site_scores.overallScore).toBeLessThanOrEqual(100);
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
test('module filter flags behavior', async () => {
|
|
23
|
+
const seoOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, seo: true });
|
|
24
|
+
expect(seoOnly.pages[0].content.wordCount).toBe(0);
|
|
25
|
+
expect(seoOnly.pages[0].images.totalImages).toBe(0);
|
|
26
|
+
|
|
27
|
+
const contentOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, content: true });
|
|
28
|
+
expect(contentOnly.pages[0].title.status).toBe('missing');
|
|
29
|
+
expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);
|
|
30
|
+
|
|
31
|
+
const accessibilityOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, accessibility: true });
|
|
32
|
+
expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
|
|
33
|
+
expect(accessibilityOnly.pages[0].title.status).toBe('missing');
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
test('html report generation', async () => {
|
|
37
|
+
const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
|
|
38
|
+
const html = renderAnalysisHtml(result);
|
|
39
|
+
expect(html).toContain('<table');
|
|
40
|
+
expect(html).toContain('Analysis');
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
test('default database loading', async () => {
  // Remember any pre-existing value so the test leaves the environment as it found it.
  const previousDbPath = process.env.CRAWLITH_DB_PATH;

  // Force in-memory DB for this test.
  process.env.CRAWLITH_DB_PATH = ':memory:';

  // Assigned once the db module is loaded; stays undefined if the import itself fails.
  let closeDb: (() => unknown) | undefined;

  // Everything after the env override runs inside try/finally so that a failure
  // during setup (imports, repository calls, fixture parsing) still closes the
  // connection and restores the environment for the following tests.
  try {
    const dbModule = await import('../src/db/index.js');
    const { getDb } = dbModule;
    closeDb = dbModule.closeDb;

    // Close existing DB connection if any to ensure a fresh start.
    closeDb();

    // Set up repositories.
    const { SiteRepository } = await import('../src/db/repositories/SiteRepository.js');
    const { SnapshotRepository } = await import('../src/db/repositories/SnapshotRepository.js');
    const { PageRepository } = await import('../src/db/repositories/PageRepository.js');

    const db = getDb();
    const siteRepo = new SiteRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pageRepo = new PageRepository(db);

    // Create site and snapshot.
    const siteId = siteRepo.createSite('example.com');
    const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');

    // Parse the JSON fixture and load its pages into the database.
    const rawJson = await fs.readFile(fixturePath, 'utf-8');
    const rawData = JSON.parse(rawJson);
    // The fixture lists its entries under either `pages` or `nodes`.
    (rawData.pages || rawData.nodes).forEach((p: any) => {
      pageRepo.upsertPage({
        site_id: siteId,
        normalized_url: p.url,
        last_seen_snapshot_id: snapshotId,
        http_status: p.status || 200,
        html: p.html || '',
        depth: p.depth || 0,
      });
    });

    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 3, edge_count: 0 });

    // No `fromCrawl` option: the analysis has to read the pages stored above.
    const result = await analyzeSite('https://example.com', {});
    expect(result.site_summary.pages_analyzed).toBe(3);
  } finally {
    closeDb?.();
    if (previousDbPath === undefined) {
      delete process.env.CRAWLITH_DB_PATH;
    } else {
      process.env.CRAWLITH_DB_PATH = previousDbPath;
    }
  }
});
|
|
89
|
+
|
|
90
|
+
test('handles large html and js-only content', async () => {
  // One word per paragraph, so the expected word count equals the paragraph count.
  // The inline <script> text is expected NOT to contribute to the count.
  const paragraphCount = 1000;
  const hugeText =
    '<html><body><script>document.write("x")</script>' +
    '<p>word </p>'.repeat(paragraphCount) +
    '</body></html>';
  const tmpFile = path.resolve(import.meta.dirname, 'fixtures/large-analyze.json');

  // Cleanup lives in `finally` so a failed write, analysis or assertion does not
  // leave the generated fixture behind in the fixtures directory.
  try {
    await fs.writeFile(
      tmpFile,
      JSON.stringify({ pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] }),
    );
    const result = await analyzeSite('https://example.com', { fromCrawl: tmpFile });
    expect(result.pages[0].content.wordCount).toBe(paragraphCount);
  } finally {
    // `force: true` keeps cleanup from masking the original error when the file was never created.
    await fs.rm(tmpFile, { force: true });
  }
});
|
|
98
|
+
});
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
+
import { resolveDns } from '../../src/audit/dns.js';
|
|
3
|
+
import dns from 'node:dns/promises';
|
|
4
|
+
|
|
5
|
+
vi.mock('node:dns/promises');
|
|
6
|
+
|
|
7
|
+
describe('DNS Diagnostics', () => {
  // Builds the rejection used to simulate a failing lookup.
  const lookupError = (code: string) => new Error(code);

  it('should resolve all records', async () => {
    // A, AAAA and reverse lookups succeed; the CNAME lookup is rejected.
    vi.spyOn(dns, 'resolve4').mockResolvedValue(['1.1.1.1']);
    vi.spyOn(dns, 'resolve6').mockResolvedValue(['2606::1']);
    vi.spyOn(dns, 'resolveCname').mockRejectedValue(lookupError('ENODATA'));
    vi.spyOn(dns, 'reverse').mockResolvedValue(['one.one.one.one']);

    const records = await resolveDns('example.com');

    expect(records.a).toEqual(['1.1.1.1']);
    expect(records.aaaa).toEqual(['2606::1']);
    expect(records.ipv6Support).toBe(true);
    expect(records.reverse).toEqual(['one.one.one.one']);
    expect(records.resolutionTime).toBeGreaterThanOrEqual(0);
  });

  it('should handle failures gracefully', async () => {
    // Every forward lookup is rejected; the call must still resolve with empty data.
    vi.spyOn(dns, 'resolve4').mockRejectedValue(lookupError('ENOTFOUND'));
    vi.spyOn(dns, 'resolve6').mockRejectedValue(lookupError('ENOTFOUND'));
    vi.spyOn(dns, 'resolveCname').mockRejectedValue(lookupError('ENOTFOUND'));

    const records = await resolveDns('invalid.com');

    expect(records.a).toEqual([]);
    expect(records.ipCount).toBe(0);
  });
});
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { analyzeHeaders } from '../../src/audit/headers.js';
|
|
3
|
+
|
|
4
|
+
describe('Headers Analysis', () => {
  it('should detect all secure headers', () => {
    // A response carrying every security header this test supplies.
    const secureHeaders = {
      'strict-transport-security': 'max-age=31536000; includeSubDomains',
      'content-security-policy': "default-src 'self'",
      'x-frame-options': 'DENY',
      'x-content-type-options': 'nosniff',
      'referrer-policy': 'strict-origin-when-cross-origin',
      'permissions-policy': 'geolocation=()'
    };

    const report = analyzeHeaders(secureHeaders);

    expect(report.score).toBe(100);
    expect(report.strictTransportSecurity.valid).toBe(true);
  });

  it('should handle missing headers', () => {
    const noHeaders = {};

    const report = analyzeHeaders(noHeaders);

    expect(report.score).toBe(0);
    expect(report.strictTransportSecurity.present).toBe(false);
  });

  it('should validate HSTS properly', () => {
    const hstsOnly = {
      'strict-transport-security': 'max-age=0'
    };

    const report = analyzeHeaders(hstsOnly);

    // The value still carries a max-age directive, so the header is expected to be
    // reported as valid; the absent includeSubDomains directive is expected to be
    // surfaced as an issue.
    expect(report.strictTransportSecurity.valid).toBe(true);
    expect(report.strictTransportSecurity.issues).toContain('Missing includeSubDomains');
  });

  it('should validate invalid HSTS', () => {
    // No max-age directive at all, so the header must be reported as invalid.
    const brokenHsts = {
      'strict-transport-security': 'invalid'
    };

    const report = analyzeHeaders(brokenHsts);

    expect(report.strictTransportSecurity.valid).toBe(false);
  });
});
|