@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
 * Derives site-structure metrics from a crawled link graph.
 *
 * @param graph     Graph-like object exposing getNodes()/getEdges(); nodes carry
 *                  url, depth, inLinks, outLinks and optional authorityScore/pageRank.
 * @param _maxDepth Unused; retained for interface compatibility.
 * @returns Aggregate metrics: totals, orphan/deep page lists, depth statistics,
 *          structural entropy, and top-10 authority/PageRank pages.
 */
export function calculateMetrics(graph, _maxDepth) {
    const nodes = graph.getNodes();
    const edges = graph.getEdges();
    const totalPages = nodes.length;
    const totalEdges = edges.length;
    // Highest in-link count, used to log-normalize per-node authority into [0, 1].
    let maxInLinks = 0;
    for (const node of nodes) {
        if (node.inLinks > maxInLinks) {
            maxInLinks = node.inLinks;
        }
    }
    const authorityOf = (node) => (maxInLinks === 0
        ? 0
        : Math.log(1 + node.inLinks) / Math.log(1 + maxInLinks));
    // Single pass over the nodes collecting classifications and tallies.
    const orphanPages = [];   // unreachable via links: inLinks === 0 && depth > 0
    const nearOrphans = [];   // barely linked deep pages: inLinks === 1 && depth >= 3
    const deepPages = [];     // depth >= 4 (per requirement)
    const outDegreeHistogram = new Map();
    let depthSum = 0;
    let maxDepthFound = 0;
    for (const node of nodes) {
        if (node.inLinks === 0 && node.depth > 0) {
            orphanPages.push(node.url);
        }
        if (node.inLinks === 1 && node.depth >= 3) {
            nearOrphans.push(node.url);
        }
        if (node.depth >= 4) {
            deepPages.push(node.url);
        }
        depthSum += node.depth;
        if (node.depth > maxDepthFound) {
            maxDepthFound = node.depth;
        }
        outDegreeHistogram.set(node.outLinks, (outDegreeHistogram.get(node.outLinks) || 0) + 1);
    }
    // Efficiency degrades with the share of pages buried at depth >= 4.
    const crawlEfficiencyScore = totalPages > 0 ? 1 - (deepPages.length / totalPages) : 1;
    const averageDepth = totalPages > 0 ? depthSum / totalPages : 0;
    // Shannon entropy (bits) of the out-degree distribution.
    let structuralEntropy = 0;
    if (totalPages > 0) {
        for (const bucketSize of outDegreeHistogram.values()) {
            const p = bucketSize / totalPages;
            if (p > 0) {
                structuralEntropy -= p * Math.log2(p);
            }
        }
    }
    // Top 10 by authority; a precomputed authorityScore wins over the derived one.
    const topAuthorityPages = nodes
        .map(node => ({ url: node.url, authority: node.authorityScore ?? authorityOf(node) }))
        .sort((a, b) => b.authority - a.authority)
        .slice(0, 10);
    // Top 10 by raw PageRank, skipping nodes that never received one.
    const topPageRankPages = nodes
        .filter(node => node.pageRank !== undefined)
        .map(node => ({ url: node.url, score: node.pageRank }))
        .sort((a, b) => b.score - a.score)
        .slice(0, 10);
    const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
    return {
        totalPages,
        totalEdges,
        orphanPages,
        nearOrphans,
        deepPages,
        topAuthorityPages,
        averageOutDegree,
        maxDepthFound,
        crawlEfficiencyScore,
        averageDepth,
        structuralEntropy,
        topPageRankPages,
        limitReached: graph.limitReached,
        sessionStats: graph.sessionStats
    };
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { Graph } from './graph.js';
/** Tuning options for the weighted PageRank computation. */
interface PageRankOptions {
    /** Link-follow probability d in the rank formula (default 0.85). */
    dampingFactor?: number;
    /** Hard cap on power-iteration rounds (default 40). */
    maxIterations?: number;
    /** Stop when the largest per-node rank delta falls below this (default 1e-5). */
    convergenceThreshold?: number;
    /** Nodes whose soft404Score exceeds this are excluded from ranking (default 0.8). */
    soft404WeightThreshold?: number;
}
/**
 * Production-Grade Weighted PageRank Engine
 */
export declare function computePageRank(graph: Graph, options?: PageRankOptions): void;
export {};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Production-Grade Weighted PageRank Engine
|
|
3
|
+
*/
|
|
4
|
+
export function computePageRank(graph, options = {}) {
|
|
5
|
+
const d = options.dampingFactor ?? 0.85;
|
|
6
|
+
const maxIterations = options.maxIterations ?? 40;
|
|
7
|
+
const epsilon = options.convergenceThreshold ?? 1e-5;
|
|
8
|
+
const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
|
|
9
|
+
const allNodes = graph.getNodes();
|
|
10
|
+
const allEdges = graph.getEdges();
|
|
11
|
+
// 1. Filter Eligible Nodes
|
|
12
|
+
const eligibleNodes = allNodes.filter(node => {
|
|
13
|
+
if (node.noindex)
|
|
14
|
+
return false;
|
|
15
|
+
if (node.isCollapsed)
|
|
16
|
+
return false;
|
|
17
|
+
if (node.soft404Score && node.soft404Score > soft404Threshold)
|
|
18
|
+
return false;
|
|
19
|
+
if (node.canonical && node.canonical !== node.url)
|
|
20
|
+
return false;
|
|
21
|
+
if (node.status >= 400)
|
|
22
|
+
return false; // Don't pass rank to broken pages
|
|
23
|
+
return true;
|
|
24
|
+
});
|
|
25
|
+
const nodeCount = eligibleNodes.length;
|
|
26
|
+
if (nodeCount === 0)
|
|
27
|
+
return;
|
|
28
|
+
const nodeUrls = eligibleNodes.map(n => n.url);
|
|
29
|
+
const nodeMap = new Map();
|
|
30
|
+
eligibleNodes.forEach(n => nodeMap.set(n.url, n));
|
|
31
|
+
// Initialize PageRank
|
|
32
|
+
let pr = new Map();
|
|
33
|
+
nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));
|
|
34
|
+
// Pre-calculate weighted outbound sums and inverted adjacency
|
|
35
|
+
const outWeights = new Map();
|
|
36
|
+
const incoming = new Map();
|
|
37
|
+
const sinks = [];
|
|
38
|
+
// Initialize outWeights for all eligible nodes
|
|
39
|
+
nodeUrls.forEach(url => outWeights.set(url, 0));
|
|
40
|
+
for (const edge of allEdges) {
|
|
41
|
+
if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
|
|
42
|
+
const weight = edge.weight || 1.0;
|
|
43
|
+
const sources = incoming.get(edge.target) ?? [];
|
|
44
|
+
sources.push({ source: edge.source, weight });
|
|
45
|
+
incoming.set(edge.target, sources);
|
|
46
|
+
outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
// Identify sinks
|
|
50
|
+
nodeUrls.forEach(url => {
|
|
51
|
+
if ((outWeights.get(url) || 0) === 0) {
|
|
52
|
+
sinks.push(url);
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
// Iterative Calculation
|
|
56
|
+
for (let i = 0; i < maxIterations; i++) {
|
|
57
|
+
const nextPr = new Map();
|
|
58
|
+
// Calculate total rank from sinks to redistribute
|
|
59
|
+
let sinkRankTotal = 0;
|
|
60
|
+
for (const url of sinks) {
|
|
61
|
+
sinkRankTotal += pr.get(url) || 0;
|
|
62
|
+
}
|
|
63
|
+
const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
|
|
64
|
+
for (const url of nodeUrls) {
|
|
65
|
+
let rankFromLinks = 0;
|
|
66
|
+
const sources = incoming.get(url) || [];
|
|
67
|
+
for (const edge of sources) {
|
|
68
|
+
const sourceRank = pr.get(edge.source) || 0;
|
|
69
|
+
const sourceOutWeight = outWeights.get(edge.source) || 1.0;
|
|
70
|
+
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
71
|
+
}
|
|
72
|
+
const newRank = baseRank + d * rankFromLinks;
|
|
73
|
+
nextPr.set(url, newRank);
|
|
74
|
+
}
|
|
75
|
+
// Convergence check
|
|
76
|
+
let maxDelta = 0;
|
|
77
|
+
for (const url of nodeUrls) {
|
|
78
|
+
const delta = Math.abs(nextPr.get(url) - pr.get(url));
|
|
79
|
+
if (delta > maxDelta)
|
|
80
|
+
maxDelta = delta;
|
|
81
|
+
}
|
|
82
|
+
pr = nextPr;
|
|
83
|
+
if (maxDelta < epsilon)
|
|
84
|
+
break;
|
|
85
|
+
}
|
|
86
|
+
// 2. Normalization (0-100)
|
|
87
|
+
const ranks = Array.from(pr.values());
|
|
88
|
+
const minPR = Math.min(...ranks);
|
|
89
|
+
const maxPR = Math.max(...ranks);
|
|
90
|
+
const range = maxPR - minPR;
|
|
91
|
+
for (const node of eligibleNodes) {
|
|
92
|
+
const rawRank = pr.get(node.url);
|
|
93
|
+
node.pageRank = rawRank;
|
|
94
|
+
if (range > 1e-12) {
|
|
95
|
+
node.pageRankScore = 100 * (rawRank - minPR) / range;
|
|
96
|
+
}
|
|
97
|
+
else {
|
|
98
|
+
// If there's no range, all eligible pages are equally important.
|
|
99
|
+
node.pageRankScore = 100;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * 64-bit SimHash over FNV-1a token hashes. All hashes are bigint so the
 * full 64 bits are represented exactly.
 */
export declare class SimHash {
    /** FNV-1a 64-bit prime (1099511628211). */
    private static FNV_PRIME;
    /** FNV-1a 64-bit offset basis (14695981039346656037). */
    private static FNV_OFFSET_BASIS;
    /** Mask used to emulate unsigned 64-bit overflow in bigint math. */
    private static MAX_UINT64;
    /**
     * Generates a 64-bit FNV-1a hash for a given string token.
     */
    static fnv1a64(token: string): bigint;
    /**
     * Generates a 64-bit SimHash from an array of tokens.
     */
    static generate(tokens: string[]): bigint;
    /**
     * Computes the Hamming distance between two 64-bit hashes.
     */
    static hammingDistance(a: bigint, b: bigint): number;
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
 * 64-bit SimHash built on FNV-1a token hashing. bigint arithmetic keeps
 * all 64 bits exact; the MAX_UINT64 mask emulates unsigned overflow.
 */
export class SimHash {
    static FNV_PRIME = 1099511628211n;
    static FNV_OFFSET_BASIS = 14695981039346656037n;
    static MAX_UINT64 = 0xffffffffffffffffn;
    /**
     * Generates a 64-bit FNV-1a hash for a given string token.
     */
    static fnv1a64(token) {
        let hash = this.FNV_OFFSET_BASIS;
        for (let i = 0; i < token.length; i++) {
            hash ^= BigInt(token.charCodeAt(i));
            // Multiply modulo 2^64 to emulate unsigned 64-bit overflow.
            hash = (hash * this.FNV_PRIME) & this.MAX_UINT64;
        }
        return hash;
    }
    /**
     * Generates a 64-bit SimHash from an array of tokens.
     * Each result bit is the sign of a per-bit vote across all token hashes.
     */
    static generate(tokens) {
        const votes = new Int32Array(64);
        for (const token of tokens) {
            const hash = this.fnv1a64(token);
            for (let bit = 0; bit < 64; bit++) {
                votes[bit] += ((hash >> BigInt(bit)) & 1n) === 1n ? 1 : -1;
            }
        }
        let fingerprint = 0n;
        for (let bit = 0; bit < 64; bit++) {
            if (votes[bit] > 0) {
                fingerprint |= 1n << BigInt(bit);
            }
        }
        return fingerprint;
    }
    /**
     * Computes the Hamming distance between two 64-bit hashes.
     */
    static hammingDistance(a, b) {
        let diff = a ^ b;
        let count = 0;
        while (diff > 0n) {
            diff &= diff - 1n; // Kernighan: clear the lowest set bit each step.
            count++;
        }
        return count;
    }
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// Public barrel for @crawlith/core: re-exports every user-facing module.
// Crawling and graph metrics
export * from './crawler/crawl.js';
export * from './crawler/metricsRunner.js';
export * from './graph/metrics.js';
// Reporting (HTML report + sitegraph export)
export * from './report/html.js';
export * from './report/sitegraph_template.js';
export * from './report/sitegraphExport.js';
// Graph model, diffing and scoring
export * from './graph/graph.js';
export * from './diff/compare.js';
export * from './scoring/orphanSeverity.js';
export * from './graph/pagerank.js';
export * from './graph/duplicate.js';
export * from './graph/cluster.js';
export * from './scoring/hits.js';
// Page analysis (content, SEO, media, links)
export * from './analysis/analyze.js';
export * from './analysis/content.js';
export * from './analysis/seo.js';
export * from './analysis/images.js';
export * from './analysis/links.js';
// Transport/security audit
export * from './audit/index.js';
export * from './audit/types.js';
// Persistence layer (database + repositories)
export * from './db/index.js';
export * from './db/graphLoader.js';
export * from './db/repositories/SiteRepository.js';
export * from './db/repositories/SnapshotRepository.js';
export * from './db/repositories/PageRepository.js';
export * from './db/repositories/EdgeRepository.js';
export * from './db/repositories/MetricsRepository.js';
// Process locking and versioning utilities
export * from './lock/lockManager.js';
export * from './lock/hashKey.js';
export * from './utils/version.js';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// Public barrel for @crawlith/core: re-exports every user-facing module.
// Crawling and graph metrics
export * from './crawler/crawl.js';
export * from './crawler/metricsRunner.js';
export * from './graph/metrics.js';
// Reporting (HTML report + sitegraph export)
export * from './report/html.js';
export * from './report/sitegraph_template.js';
export * from './report/sitegraphExport.js';
// Graph model, diffing and scoring
export * from './graph/graph.js';
export * from './diff/compare.js';
export * from './scoring/orphanSeverity.js';
export * from './graph/pagerank.js';
export * from './graph/duplicate.js';
export * from './graph/cluster.js';
export * from './scoring/hits.js';
// Page analysis (content, SEO, media, links)
export * from './analysis/analyze.js';
export * from './analysis/content.js';
export * from './analysis/seo.js';
export * from './analysis/images.js';
export * from './analysis/links.js';
// Transport/security audit
export * from './audit/index.js';
export * from './audit/types.js';
// Persistence layer (database + repositories)
export * from './db/index.js';
export * from './db/graphLoader.js';
export * from './db/repositories/SiteRepository.js';
export * from './db/repositories/SnapshotRepository.js';
export * from './db/repositories/PageRepository.js';
export * from './db/repositories/EdgeRepository.js';
export * from './db/repositories/MetricsRepository.js';
// Process locking and versioning utilities
export * from './lock/lockManager.js';
export * from './lock/hashKey.js';
export * from './utils/version.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Derives a stable SHA-256 lock key (hex string) from the command name,
 * the normalized target URL, and the crawl-relevant CLI options.
 */
export declare function generateLockKey(commandName: string, targetUrl: string, options: any): string;
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import crypto from 'node:crypto';
|
|
2
|
+
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
+
// Flags that change the nature of the crawl and should be part of the lock key
const RELEVANT_FLAGS = [
    'limit',
    'depth',
    'output',
    'sitemap',
    'incremental',
    'detectSoft404',
    'detectTraps',
    'includeSubdomains',
    'allow',
    'deny',
    'proxy',
    'ua',
    'maxRedirects',
    'rate',
    'maxBytes',
    'concurrency'
];
/**
 * Derives a stable SHA-256 lock key for a command invocation.
 * Two invocations yield the same key exactly when the command, the
 * normalized target URL, and every crawl-relevant flag match.
 */
export function generateLockKey(commandName, targetUrl, options) {
    // Mirror the sitegraph logic: strip the query string unless --query is set.
    const stripQuery = !options.query;
    const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
    // Collect relevant flags in the fixed RELEVANT_FLAGS order so the JSON
    // string below is deterministic (V8 preserves string-key insertion order).
    const lockOptions = {};
    for (const flag of RELEVANT_FLAGS) {
        if (options[flag] !== undefined) {
            lockOptions[flag] = options[flag];
        }
    }
    const payload = JSON.stringify({
        command: commandName,
        target: normalizedTarget,
        options: lockOptions
    });
    return crypto.createHash('sha256').update(payload).digest('hex');
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * File-based singleton lock preventing concurrent runs of the same
 * command/target/options combination. Lock files live under
 * ~/.crawlith/locks and embed the owning PID for staleness detection.
 */
export declare class LockManager {
    /** Absolute path of the lock file held by this process, if any. */
    private static lockFilePath;
    private static get lockDir();
    /** Acquires the lock, or exits the process (code 1) when it is already held. */
    static acquireLock(commandName: string, targetUrl: string, options: any, force?: boolean): Promise<void>;
    /** Deletes this process's lock file; errors during cleanup are swallowed. */
    static releaseLock(): void;
    private static registerHandlers;
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import { existsSync, unlinkSync, readFileSync } from 'node:fs';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import chalk from 'chalk';
|
|
6
|
+
import { generateLockKey } from './hashKey.js';
|
|
7
|
+
import { isPidAlive } from './pidCheck.js';
|
|
8
|
+
/**
 * File-based process lock keyed by command + normalized target + options.
 * Prevents two invocations with the same parameters from running
 * concurrently. Lock files are stored as ~/.crawlith/locks/<sha256>.lock
 * and record the owning PID so stale locks can be detected and reclaimed.
 */
export class LockManager {
    // Path of the lock file this process currently holds (null when unlocked).
    static lockFilePath = null;
    static get lockDir() {
        return path.join(os.homedir(), '.crawlith', 'locks');
    }
    /**
     * Acquires the lock for this invocation, or terminates the process with
     * exit code 1 when a live lock is held by another PID. With force=true
     * an existing lock is removed regardless of whether its owner is alive.
     */
    static async acquireLock(commandName, targetUrl, options, force = false) {
        const lockHash = generateLockKey(commandName, targetUrl, options);
        // Ensure lock directory exists
        // We can use sync or async here. Since this is one-time setup, async is fine.
        await fs.mkdir(this.lockDir, { recursive: true });
        const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
        // Check existing lock
        if (existsSync(lockPath)) {
            let isStale;
            let pid;
            try {
                const lockContent = readFileSync(lockPath, 'utf-8');
                const lockData = JSON.parse(lockContent);
                pid = lockData.pid;
                // A lock is stale when its recorded owner process no longer exists.
                isStale = !isPidAlive(pid);
            }
            catch (_e) {
                // Corrupted -> Treat as stale
                isStale = true;
                pid = 0; // Fallback, though unused if isStale is true
            }
            if (force) {
                console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
                try {
                    unlinkSync(lockPath);
                }
                catch { /* ignore */ }
            }
            else {
                if (!isStale) {
                    // Live owner: refuse to run a duplicate invocation.
                    console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
                    process.exit(1);
                }
                else {
                    console.log(chalk.gray('Detected stale lock. Continuing execution.'));
                    try {
                        unlinkSync(lockPath);
                    }
                    catch { /* ignore */ }
                }
            }
        }
        // Create new lock
        try {
            const data = {
                pid: process.pid,
                startedAt: Date.now(),
                command: commandName,
                target: targetUrl,
                args: options
            };
            // 'wx' flag ensures atomic creation, failing if file exists
            await fs.writeFile(lockPath, JSON.stringify(data, null, 2), { flag: 'wx', encoding: 'utf-8' });
            this.lockFilePath = lockPath;
            this.registerHandlers();
        }
        catch (error) {
            if (error.code === 'EEXIST') {
                // Race condition: another process created lock between our check and open
                console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
                process.exit(1);
            }
            throw error;
        }
    }
    /**
     * Removes this process's lock file if it is still present.
     * Errors are swallowed: cleanup must never mask the real exit path.
     */
    static releaseLock() {
        if (this.lockFilePath && existsSync(this.lockFilePath)) {
            try {
                unlinkSync(this.lockFilePath);
                this.lockFilePath = null;
            }
            catch (_error) {
                // Ignore errors during cleanup
            }
        }
    }
    /**
     * Installs exit/signal handlers so the lock is released on normal exit,
     * SIGINT (exit 130), SIGTERM (exit 143) and uncaught exceptions (exit 1).
     * NOTE(review): listeners are added on every successful acquireLock call;
     * this looks safe only because a process acquires once — confirm.
     */
    static registerHandlers() {
        // Ensure cleanup only happens once
        const cleanup = () => {
            this.releaseLock();
        };
        // process.on('exit') is only called when process.exit() is called or event loop empties.
        // It requires synchronous cleanup.
        process.on('exit', cleanup);
        // Signals
        process.on('SIGINT', () => {
            cleanup();
            process.exit(130);
        });
        process.on('SIGTERM', () => {
            cleanup();
            process.exit(143);
        });
        process.on('uncaughtException', (err) => {
            console.error(chalk.red('Uncaught Exception:'), err);
            cleanup();
            process.exit(1);
        });
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Returns true when a process with the given PID currently exists,
 * including processes this user lacks permission to signal.
 */
export declare function isPidAlive(pid: number): boolean;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * Checks whether a process with the given PID currently exists.
 * Uses signal 0, which performs existence/permission checks without
 * actually delivering a signal.
 */
export function isPidAlive(pid) {
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (error) {
        // EPERM means the process exists but we may not signal it -> alive.
        // Anything else (typically ESRCH) means it does not exist.
        return error.code === 'EPERM';
    }
}
|