@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
 * Computes Hub and Authority scores using the HITS algorithm.
 * Operates purely on the internal link graph.
 *
 * Only nodes with status 200, no redirect chain and no `noindex` flag take
 * part. Each participating node is mutated in place: `authorityScore` and
 * `hubScore` receive the L2-normalised scores, and `linkRole` is then assigned
 * by `classifyLinkRoles`. All other nodes are left untouched.
 *
 * @param {object} graph - Object exposing `getNodes()` and `getEdges()`.
 * @param {object} [options]
 * @param {number} [options.iterations] - Number of update rounds; any falsy
 *   value (including 0) falls back to 20.
 * @returns {void}
 */
export function computeHITS(graph, options = {}) {
    const iterations = options.iterations || 20;
    const nodes = graph.getNodes();
    // 1. Filter eligible nodes: status 200, not a redirect, not noindex.
    const eligibleNodes = nodes.filter(n => n.status === 200 &&
        (!n.redirectChain || n.redirectChain.length === 0) &&
        !n.noindex);
    if (eligibleNodes.length === 0)
        return;
    const urlToNode = new Map();
    for (const node of eligibleNodes) {
        urlToNode.set(node.url, node);
        // 2. Initialization
        node.authorityScore = 1.0;
        node.hubScore = 1.0;
    }
    const allEdges = graph.getEdges();
    // Keep only links whose two endpoints are both eligible, and drop self-links.
    const eligibleEdges = allEdges.filter(e => e.source !== e.target &&
        urlToNode.has(e.source) &&
        urlToNode.has(e.target));
    // Group edges by endpoint so each update only touches a node's own links.
    const incoming = new Map();
    const outgoing = new Map();
    for (const edge of eligibleEdges) {
        // A missing or non-finite weight would turn every score into NaN;
        // count such an edge as a plain (weight 1) link instead.
        const weight = Number.isFinite(edge.weight) ? edge.weight : 1;
        if (!incoming.has(edge.target))
            incoming.set(edge.target, []);
        incoming.get(edge.target).push({ source: edge.source, weight });
        if (!outgoing.has(edge.source))
            outgoing.set(edge.source, []);
        outgoing.get(edge.source).push({ target: edge.target, weight });
    }
    // 3. Iteration
    for (let i = 0; i < iterations; i++) {
        // Update Authorities: weighted sum of the hub scores of linking pages.
        let normAuth = 0;
        for (const node of eligibleNodes) {
            const inLinks = incoming.get(node.url) || [];
            let newAuth = 0;
            for (const link of inLinks) {
                const sourceNode = urlToNode.get(link.source);
                newAuth += (sourceNode.hubScore || 0) * link.weight;
            }
            node.authorityScore = newAuth;
            normAuth += newAuth * newAuth;
        }
        // Normalize Authorities (L2 norm); skipped when every score is zero.
        normAuth = Math.sqrt(normAuth);
        if (normAuth > 0) {
            for (const node of eligibleNodes) {
                node.authorityScore = (node.authorityScore || 0) / normAuth;
            }
        }
        // Update Hubs: weighted sum of the freshly normalised authority scores
        // of the pages this node links to.
        let normHub = 0;
        for (const node of eligibleNodes) {
            const outLinks = outgoing.get(node.url) || [];
            let newHub = 0;
            for (const link of outLinks) {
                const targetNode = urlToNode.get(link.target);
                newHub += (targetNode.authorityScore || 0) * link.weight;
            }
            node.hubScore = newHub;
            normHub += newHub * newHub;
        }
        // Normalize Hubs (L2 norm); skipped when every score is zero.
        normHub = Math.sqrt(normHub);
        if (normHub > 0) {
            for (const node of eligibleNodes) {
                node.hubScore = (node.hubScore || 0) / normHub;
            }
        }
    }
    // 4. Classification Logic
    classifyLinkRoles(eligibleNodes);
}
|
|
82
|
+
/**
 * Assigns a `linkRole` to every node from its authority and hub scores.
 *
 * A score counts as "high" when it is strictly above the upper median of that
 * score across all given nodes and also above 0.0001. Roles:
 *   - 'power'      high authority and high hub
 *   - 'authority'  high authority only
 *   - 'hub'        high hub only
 *   - 'balanced'   neither is high, but both exceed 0.0001
 *   - 'peripheral' everything else
 * Missing or falsy scores are treated as 0. Nodes are mutated in place.
 *
 * @param {Array<object>} nodes - Nodes carrying `authorityScore` / `hubScore`.
 * @returns {void}
 */
function classifyLinkRoles(nodes) {
    if (nodes.length === 0) {
        return;
    }
    const FLOOR = 0.0001;
    // Element at index floor(n / 2) of the ascending list (upper median for even n).
    const upperMedian = (values) => {
        const ascending = [...values].sort((x, y) => x - y);
        return ascending[Math.floor(ascending.length / 2)];
    };
    const authCutoff = upperMedian(nodes.map((n) => n.authorityScore || 0));
    const hubCutoff = upperMedian(nodes.map((n) => n.hubScore || 0));
    nodes.forEach((node) => {
        const auth = node.authorityScore || 0;
        const hub = node.hubScore || 0;
        const strongAuth = auth > authCutoff && auth > FLOOR;
        const strongHub = hub > hubCutoff && hub > FLOOR;
        let role;
        if (strongAuth) {
            role = strongHub ? 'power' : 'authority';
        }
        else if (strongHub) {
            role = 'hub';
        }
        else if (auth > FLOOR && hub > FLOOR) {
            role = 'balanced';
        }
        else {
            role = 'peripheral';
        }
        node.linkRole = role;
    });
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
 * Orphan category assigned by `annotateOrphans`:
 * 'hard' (no inbound links), 'crawl-only' (no inbound links, but found via sitemap),
 * 'near' (inbound count at or below `minInbound`), 'soft' (every known inbound source is a low-value page).
 */
export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
|
|
2
|
+
/** Severity band produced by `mapImpactLevel`: up to 39 'low', up to 69 'medium', up to 89 'high', otherwise 'critical'. */
export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
|
|
3
|
+
/** Page record consumed by the orphan detection and severity scoring functions. */
export interface SitegraphNode {
|
|
4
|
+
/** Page URL; used as the lookup key for edges and matched against the low-value URL patterns. */
url: string;
|
|
5
|
+
/** Crawl depth; 0 marks the homepage when no rootUrl is configured, and depth <= 2 raises severity. */
depth: number;
|
|
6
|
+
/** Inbound link count; summed across pages sharing a canonical URL to decide orphan status. */
inLinks: number;
|
|
7
|
+
/** Presumably the outbound link count; not read by the orphan scoring code. */
outLinks: number;
|
|
8
|
+
/** Presumably the HTTP status code; not read by the orphan scoring code. */
status: number;
|
|
9
|
+
/** When true, a page with zero inbound links is classed 'crawl-only' instead of 'hard'. */
discoveredViaSitemap?: boolean;
|
|
10
|
+
/** When true, the page is never flagged as an orphan. */
robotsExcluded?: boolean;
|
|
11
|
+
/** When set, inbound links are pooled under this URL rather than `url`. */
canonicalUrl?: string;
|
|
12
|
+
/** When true, the page is treated as the homepage and never flagged as an orphan. */
isHomepage?: boolean;
|
|
13
|
+
/** Word count; above 800 raises severity, between 1 and 299 lowers it. */
wordCount?: number;
|
|
14
|
+
/** When true, raises severity. */
hasStructuredData?: boolean;
|
|
15
|
+
/** Page type, compared case-insensitively; 'archive' and 'pagination' lower severity, and listing-style types mark a low-value link source. */
pageType?: string;
|
|
16
|
+
/** When true, lowers severity and marks the page as a low-value link source. */
noindex?: boolean;
|
|
17
|
+
/** When true, lowers severity. */
duplicateContent?: boolean;
|
|
18
|
+
/** When true, raises severity. */
isProductOrCommercial?: boolean;
|
|
|
19
|
+
}
|
|
20
|
+
/** Directed link between two pages, identified by their URLs. */
export interface SitegraphEdge {
|
|
21
|
+
/** URL of the linking page; looked up among the nodes during soft-orphan detection. */
source: string;
|
|
22
|
+
/** URL of the linked page; compared with a node's `url` during soft-orphan detection. */
target: string;
|
|
23
|
+
}
|
|
24
|
+
/** Switches controlling `annotateOrphans`. */
export interface OrphanScoringOptions {
|
|
25
|
+
/** When false, every node is returned with `orphan: false` and no further analysis. */
enabled: boolean;
|
|
26
|
+
/** When false, orphans carry `orphanType` only; no severity or impact level is computed. */
severityEnabled: boolean;
|
|
27
|
+
/** When true, pages linked only from low-value pages are reported as 'soft' orphans. */
includeSoftOrphans: boolean;
|
|
28
|
+
/** Pages with a non-zero inbound count at or below this value are reported as 'near' orphans. */
minInbound: number;
|
|
29
|
+
/** When set, the node whose `url` equals this value is the homepage; otherwise depth 0 is used. */
rootUrl?: string;
|
|
30
|
+
}
|
|
31
|
+
/** A `SitegraphNode` copy returned by `annotateOrphans`, extended with the orphan verdict. */
export type AnnotatedNode = SitegraphNode & {
|
|
32
|
+
/** True when the page was classified as any kind of orphan. */
orphan: boolean;
|
|
33
|
+
/** Orphan category; present only when `orphan` is true. */
orphanType?: OrphanType;
|
|
34
|
+
/** Severity score from 0 to 100; present only for orphans when severity scoring is enabled. */
orphanSeverity?: number;
|
|
35
|
+
/** Band derived from `orphanSeverity`; present only alongside it. */
impactLevel?: ImpactLevel;
|
|
36
|
+
};
|
|
37
|
+
/** Maps a severity score to its band: <= 39 'low', <= 69 'medium', <= 89 'high', otherwise 'critical'. */
export declare function mapImpactLevel(score: number): ImpactLevel;
|
|
38
|
+
/** Returns a 0-100 severity for an orphan: a base score per orphan type, plus up to 20 for value signals, minus 20 if any quality penalty applies. */
export declare function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number;
|
|
39
|
+
/** Returns a copy of every node with its orphan verdict (and, when enabled, severity and impact level) attached; input nodes are not mutated. */
export declare function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
 * URL shapes that mark a page as a low-value link source: numeric page
 * parameters, page/tag/category path segments, query/search/filter/sort
 * parameters, and a `/search` path segment.
 */
const LOW_VALUE_PATTERNS = [
    /[?&](page|p)=\d+/i,
    /\/(page|tag|tags|category|categories)\//i,
    /[?&](q|query|search|filter|sort)=/i,
    /\/search(\/|\?|$)/i
];
/**
 * Tells whether a page is a low-value link source.
 *
 * A page qualifies when its `pageType` (case-insensitive) is one of the
 * listing-style types, when it is `noindex`, or when its URL matches one of
 * LOW_VALUE_PATTERNS.
 *
 * @param {object} node - Page with `url` and optional `pageType` / `noindex`.
 * @returns {boolean}
 */
function isLowValuePage(node) {
    const listingTypes = ['pagination', 'tag', 'category', 'filter', 'search', 'archive'];
    const normalizedType = (node.pageType || '').toLowerCase();
    const flaggedByMetadata = listingTypes.includes(normalizedType) || Boolean(node.noindex);
    if (flaggedByMetadata) {
        return true;
    }
    for (const shape of LOW_VALUE_PATTERNS) {
        if (shape.test(node.url)) {
            return true;
        }
    }
    return false;
}
|
|
17
|
+
/**
 * Rounds a score to the nearest integer and limits it to the range 0-100.
 *
 * @param {number} score - Raw score.
 * @returns {number} Integer between 0 and 100.
 */
function clampScore(score) {
    const rounded = Math.round(score);
    const capped = Math.min(100, rounded);
    return Math.max(0, capped);
}
|
|
20
|
+
/**
 * Translates an orphan severity score into an impact band.
 *
 * @param {number} score - Severity score.
 * @returns {'low'|'medium'|'high'|'critical'} 'low' up to 39, 'medium' up to
 *   69, 'high' up to 89, 'critical' for anything else.
 */
export function mapImpactLevel(score) {
    // Inclusive upper bound of each band, in ascending order.
    const bands = [
        [39, 'low'],
        [69, 'medium'],
        [89, 'high'],
    ];
    for (const [ceiling, label] of bands) {
        if (score <= ceiling) {
            return label;
        }
    }
    return 'critical';
}
|
|
29
|
+
/**
 * Scores how serious an orphaned page is, on a 0-100 scale.
 *
 * Base score by orphan type: 'hard' 90, 'crawl-only' 80, 'near' 70 (at most
 * one inbound link) or 60, 'soft' 50; any other type starts at 0.
 * Value signals (more than 800 words, structured data, depth <= 2,
 * product/commercial page) add 10 each, capped at +20.
 * Quality penalties (1-299 words, noindex, duplicate content, archive or
 * pagination page type) subtract a flat 20 however many apply.
 *
 * @param {string} orphanType - Orphan category.
 * @param {object} node - Page being scored.
 * @returns {number} Integer severity between 0 and 100.
 */
export function calculateOrphanSeverity(orphanType, node) {
    let base = 0;
    if (orphanType === 'hard') {
        base = 90;
    }
    else if (orphanType === 'crawl-only') {
        base = 80;
    }
    else if (orphanType === 'near') {
        base = node.inLinks <= 1 ? 70 : 60;
    }
    else if (orphanType === 'soft') {
        base = 50;
    }
    const words = node.wordCount || 0;
    const valueSignals = [
        words > 800,
        Boolean(node.hasStructuredData),
        node.depth <= 2,
        Boolean(node.isProductOrCommercial),
    ];
    const bonus = Math.min(20, valueSignals.filter(Boolean).length * 10);
    const kind = (node.pageType || '').toLowerCase();
    // Each penalty is worth 20 and the total is capped at 20, so one is as bad as many.
    const penalized = (words > 0 && words < 300) ||
        node.noindex ||
        node.duplicateContent ||
        kind === 'archive' ||
        kind === 'pagination';
    const penalty = penalized ? 20 : 0;
    return clampScore(base + bonus - penalty);
}
|
|
69
|
+
/**
 * Totals inbound link counts per canonical URL.
 *
 * Pages that declare the same `canonicalUrl` (or, lacking one, share a `url`)
 * are pooled, so links pointing at any variant count for all of them.
 *
 * @param {Array<object>} nodes - Pages with `url`, `inLinks` and optional `canonicalUrl`.
 * @returns {Map<string, number>} Canonical URL mapped to the summed `inLinks`.
 */
function consolidateInboundByCanonical(nodes) {
    return nodes.reduce((totals, page) => {
        const key = page.canonicalUrl || page.url;
        const soFar = totals.get(key) || 0;
        // Map#set returns the map itself, which becomes the next accumulator.
        return totals.set(key, soFar + page.inLinks);
    }, new Map());
}
|
|
77
|
+
/**
 * Labels every page with its orphan status.
 *
 * Returns a new array of shallow node copies; the input nodes are not mutated.
 * The homepage and robots-excluded pages are never orphans. Otherwise, using
 * the inbound count pooled by canonical URL:
 *   - 0 inbound: 'crawl-only' if discovered via sitemap, else 'hard'
 *   - inbound <= options.minInbound: 'near'
 *   - with options.includeSoftOrphans: 'soft' when the page has at least one
 *     known inbound source and every such source is a low-value page
 * When options.severityEnabled is set, orphans also receive `orphanSeverity`
 * and `impactLevel`.
 *
 * @param {Array<object>} nodes - Pages to annotate.
 * @param {Array<object>} edges - Links as `{ source, target }` URL pairs; only
 *   read when soft-orphan detection is needed.
 * @param {object} options - See OrphanScoringOptions.
 * @returns {Array<object>} Annotated copies, in input order.
 */
export function annotateOrphans(nodes, edges, options) {
    if (!options.enabled) {
        return nodes.map((node) => ({ ...node, orphan: false }));
    }
    const canonicalInbound = consolidateInboundByCanonical(nodes);
    const nodeByUrl = new Map(nodes.map((node) => [node.url, node]));
    // Index of target URL -> known source nodes. Built once, on first use,
    // so the soft-orphan check does not rescan every edge for every page.
    let sourcesByTarget = null;
    const inboundSourcesOf = (url) => {
        if (sourcesByTarget === null) {
            sourcesByTarget = new Map();
            for (const edge of edges) {
                const source = nodeByUrl.get(edge.source);
                // Links from pages outside the node set are ignored.
                if (!source) {
                    continue;
                }
                const bucket = sourcesByTarget.get(edge.target);
                if (bucket) {
                    bucket.push(source);
                }
                else {
                    sourcesByTarget.set(edge.target, [source]);
                }
            }
        }
        return sourcesByTarget.get(url) || [];
    };
    return nodes.map((node) => {
        // Without an explicit root URL, depth 0 identifies the homepage.
        const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
        if (isHomepage || node.robotsExcluded) {
            return { ...node, orphan: false };
        }
        const canonical = node.canonicalUrl || node.url;
        const inbound = canonicalInbound.get(canonical) || 0;
        let orphanType;
        if (inbound === 0) {
            orphanType = node.discoveredViaSitemap ? 'crawl-only' : 'hard';
        }
        else if (inbound <= options.minInbound) {
            orphanType = 'near';
        }
        if (!orphanType && options.includeSoftOrphans && inbound > 0) {
            const inboundSources = inboundSourcesOf(node.url);
            if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
                orphanType = 'soft';
            }
        }
        if (!orphanType) {
            return { ...node, orphan: false };
        }
        if (!options.severityEnabled) {
            return {
                ...node,
                orphan: true,
                orphanType
            };
        }
        // Score against the pooled inbound count, not the page's own inLinks.
        const orphanSeverity = calculateOrphanSeverity(orphanType, { ...node, inLinks: inbound });
        return {
            ...node,
            orphan: true,
            orphanType,
            orphanSeverity,
            impactLevel: mapImpactLevel(orphanSeverity)
        };
    });
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { fileURLToPath } from 'node:url';
|
|
3
|
+
import { dirname, join } from 'node:path';
|
|
4
|
+
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Reported when package.json cannot be read or carries no usable version.
const FALLBACK_VERSION = '0.0.1';
/**
 * Reads the `version` field from the package.json two directories above this
 * module.
 *
 * @returns {string} The manifest version, or FALLBACK_VERSION when the file is
 *   missing, unreadable, not valid JSON, or has no non-empty string `version`.
 */
function readPackageVersion() {
    try {
        const pkgPath = join(__dirname, '../../package.json');
        const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
        // A manifest without a string version must not replace the fallback
        // with undefined.
        if (typeof pkg?.version === 'string' && pkg.version.length > 0) {
            return pkg.version;
        }
    }
    catch {
        // Deliberate best-effort: any read or parse failure falls through to
        // the internal default below.
    }
    return FALLBACK_VERSION;
}
const version = readPackageVersion();
export { version };
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@crawlith/core",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"import": "./dist/index.js",
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"default": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"better-sqlite3": "^12.6.2",
|
|
16
|
+
"chalk": "^5.3.0",
|
|
17
|
+
"cheerio": "^1.0.0-rc.12",
|
|
18
|
+
"p-limit": "^5.0.0",
|
|
19
|
+
"robots-parser": "^3.0.1",
|
|
20
|
+
"undici": "^6.13.0",
|
|
21
|
+
"vite": "7.3.1"
|
|
22
|
+
},
|
|
23
|
+
"devDependencies": {
|
|
24
|
+
"@types/better-sqlite3": "^7.6.13",
|
|
25
|
+
"@types/node": "^20.12.7",
|
|
26
|
+
"typescript": "^5.4.5",
|
|
27
|
+
"vitest": "^4.0.18"
|
|
28
|
+
},
|
|
29
|
+
"scripts": {
|
|
30
|
+
"build": "tsc",
|
|
31
|
+
"test": "vitest run"
|
|
32
|
+
}
|
|
33
|
+
}
|