@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import { Graph, GraphNode } from './graph.js';
|
|
2
|
+
import { SimHash } from './simhash.js';
|
|
3
|
+
|
|
4
|
+
export interface DuplicateOptions {
    /** When false, skips the edge-collapse pass; nodes are still annotated. Defaults to true. */
    collapse?: boolean;
    /** Hamming distance threshold for near-duplicate simhash pairs (default: 3). */
    simhashThreshold?: number; // Hamming distance threshold (default: 3)
}

/** Internal working record for one duplicate group found by detectDuplicates. */
interface DuplicateCluster {
    /** Stable id, e.g. "cluster_exact_1" / "cluster_near_2". */
    id: string;
    /** Classification; 'template_heavy' overrides the other two when avg uniqueTokenRatio < 0.3. */
    type: 'exact' | 'near' | 'template_heavy';
    /** Member nodes (live references into the graph, not copies). */
    nodes: GraphNode[];
    /** URL of the page chosen to represent the cluster. */
    representative?: string;
    /** high = canonical conflict or missing canonicals; medium = near dup; low = clean exact dup. */
    severity?: 'low' | 'medium' | 'high';
}
|
|
16
|
+
|
|
17
|
+
/**
 * Detects exact and near duplicates, identifies canonical conflicts,
 * and performs non-destructive collapse of edges.
 *
 * Pipeline:
 *   1/2. Exact duplicates: group 200-status nodes by contentHash.
 *   3.   Near duplicates: LSH banding of 64-bit simhashes (4 x 16-bit bands),
 *        pairwise Hamming checks within each bucket, merged into clusters via
 *        reference-propagated union sets.
 *   4.   Clusters with avg uniqueTokenRatio < 0.3 become 'template_heavy'.
 *   5.   Canonical-conflict severity scoring.
 *   6.   Representative selection; optionally repoints inbound edges of
 *        collapsed members to the representative and rebuilds link counts.
 *
 * Mutates `graph`: node duplicate flags, graph.duplicateClusters, and (when
 * collapsing) graph.edges plus every node's inLinks/outLinks.
 */
export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
    const collapse = options.collapse !== false; // Default to true
    const threshold = options.simhashThreshold ?? 3;

    const exactClusters: DuplicateCluster[] = [];
    const nearClusters: DuplicateCluster[] = [];

    const nodes = graph.getNodes();

    // Phase 1 & 2: Exact Duplicate Detection — bucket by identical contentHash.
    const exactMap = new Map<string, GraphNode[]>();
    for (const node of nodes) {
        // Only successfully fetched pages with a computed hash participate.
        if (!node.contentHash || node.status !== 200) continue;

        // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
        let arr = exactMap.get(node.contentHash);
        if (!arr) {
            arr = [];
            exactMap.set(node.contentHash, arr);
        }
        arr.push(node);
    }

    // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
    const nearCandidates: GraphNode[] = [];
    // Shared counter so exact and near cluster ids never collide.
    let clusterCounter = 1;

    for (const [_hash, group] of exactMap.entries()) {
        if (group.length > 1) {
            const id = `cluster_exact_${clusterCounter++}`;
            exactClusters.push({ id, type: 'exact', nodes: group });
            // Mark nodes
            for (const n of group) {
                n.duplicateClusterId = id;
                n.duplicateType = 'exact';
            }
        } else {
            nearCandidates.push(group[0]);
        }
    }

    // Phase 3: Near Duplicate Detection (SimHash with Bands)
    // 64-bit simhash -> split into 4 bands of 16 bits.
    // Pigeonhole: any pair within Hamming distance <= 3 must agree on at least
    // one whole band, so band-bucketing finds every candidate pair for the
    // default threshold (for thresholds > 3 some pairs could be missed —
    // NOTE(review): confirm callers never raise simhashThreshold above 3).
    const bandsMaps = [
        new Map<number, GraphNode[]>(),
        new Map<number, GraphNode[]>(),
        new Map<number, GraphNode[]>(),
        new Map<number, GraphNode[]>()
    ];

    for (const node of nearCandidates) {
        if (!node.simhash) continue;
        const simhash = BigInt(node.simhash);

        // Extract 16 bit bands
        const b0 = Number(simhash & 0xFFFFn);
        const b1 = Number((simhash >> 16n) & 0xFFFFn);
        const b2 = Number((simhash >> 32n) & 0xFFFFn);
        const b3 = Number((simhash >> 48n) & 0xFFFFn);

        const bands = [b0, b1, b2, b3];
        for (let i = 0; i < 4; i++) {
            let arr = bandsMaps[i].get(bands[i]);
            if (!arr) {
                arr = [];
                bandsMaps[i].set(bands[i], arr);
            }
            arr.push(node);
        }
    }

    // Find candidate pairs
    const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set
    const checkedPairs = new Set<string>();

    for (let i = 0; i < 4; i++) {
        for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
            if (bucketNodes.length < 2) continue; // nothing to compare

            // Compare all nodes in this bucket
            for (let j = 0; j < bucketNodes.length; j++) {
                for (let k = j + 1; k < bucketNodes.length; k++) {
                    const n1 = bucketNodes[j];
                    const n2 = bucketNodes[k];

                    // Ensure n1 < n2 lexicographically to avoid duplicate pairs
                    const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
                    const pairKey = `${a.url}|${b.url}`;

                    // The same pair can share several bands; compare it once.
                    if (checkedPairs.has(pairKey)) continue;
                    checkedPairs.add(pairKey);

                    // Non-null assertion is safe: only nodes with a simhash were bucketed.
                    const dist = SimHash.hammingDistance(BigInt(a.simhash!), BigInt(b.simhash!));
                    if (dist <= threshold) {
                        // They are near duplicates.
                        // Find or create their cluster set using union-find or reference propagation
                        const setA = nearGroupMap.get(a.url);
                        const setB = nearGroupMap.get(b.url);

                        if (!setA && !setB) {
                            const newSet = new Set<GraphNode>([a, b]);
                            nearGroupMap.set(a.url, newSet);
                            nearGroupMap.set(b.url, newSet);
                        } else if (setA && !setB) {
                            setA.add(b);
                            nearGroupMap.set(b.url, setA);
                        } else if (setB && !setA) {
                            setB.add(a);
                            nearGroupMap.set(a.url, setB);
                        } else if (setA && setB && setA !== setB) {
                            // Merge sets: fold setB's members into setA and repoint
                            // every member's map entry at the surviving set.
                            for (const node of setB) {
                                setA.add(node);
                                nearGroupMap.set(node.url, setA);
                            }
                        }
                    }
                }
            }
        }
    }

    // Compile near duplicate clusters (deduplicated by Set reference)
    const uniqueNearSets = new Set<Set<GraphNode>>();
    for (const group of nearGroupMap.values()) {
        uniqueNearSets.add(group);
    }

    for (const groupSet of uniqueNearSets) {
        if (groupSet.size > 1) {
            const id = `cluster_near_${clusterCounter++}`;
            const groupArr = Array.from(groupSet);
            nearClusters.push({ id, type: 'near', nodes: groupArr });
            for (const n of groupArr) {
                n.duplicateClusterId = id;
                n.duplicateType = 'near';
            }
        }
    }

    const allClusters = [...exactClusters, ...nearClusters];

    // Phase 4: Template-Heavy Detection
    // Mark classes as 'template_heavy' if ratio < 0.3
    // (low average unique-token ratio => the pages are mostly shared boilerplate).
    for (const cluster of allClusters) {
        const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
        if (avgRatio < 0.3) {
            cluster.type = 'template_heavy';
            cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
        }
    }

    // Phase 5: Canonical Conflict & Representative Selection
    for (const cluster of allClusters) {
        const canonicals = new Set<string>();
        let hasMissing = false;

        for (const n of cluster.nodes) {
            if (!n.canonical) hasMissing = true;
            // We compare full absolute canonical URLs (assuming they are normalized during crawl)
            else canonicals.add(n.canonical);
        }

        // Any missing canonical or disagreement among declared canonicals is
        // the worst case; otherwise severity tracks the duplicate type.
        if (hasMissing || canonicals.size > 1) {
            cluster.severity = 'high';
        } else if (cluster.type === 'near') {
            cluster.severity = 'medium';
        } else {
            cluster.severity = 'low';
        }

        // Phase 6: Select Representative
        // 1. Valid Canonical target in cluster
        // 2. Highest internal in-degree
        // 3. Shortest URL
        // 4. First discovered (relying on array order, which is from BFS map roughly)
        let representativeNode = cluster.nodes[0];

        // Evaluate best rep
        const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
        // A "valid canonical" here is a self-canonical member of this cluster.
        const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);

        if (validCanonicals.length > 0) {
            representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
        } else {
            representativeNode = cluster.nodes.reduce((best, current) => {
                if (current.inLinks > best.inLinks) return current;
                if (current.inLinks < best.inLinks) return best;
                if (current.url.length < best.url.length) return current;
                return best;
            });
        }

        cluster.representative = representativeNode.url;

        // Reset collapse markers so repeated runs start from a clean slate.
        cluster.nodes.forEach(n => {
            n.isClusterPrimary = n.url === representativeNode.url;
            n.isCollapsed = false; // default for JSON
            n.collapseInto = undefined;
        });

        // Push to Graph's final cluster list
        graph.duplicateClusters.push({
            id: cluster.id,
            type: cluster.type,
            size: cluster.nodes.length,
            representative: representativeNode.url,
            severity: cluster.severity!
        });

        // Controlled Collapse
        if (collapse) {
            for (const n of cluster.nodes) {
                if (n.url !== representativeNode.url) {
                    n.isCollapsed = true;
                    n.collapseInto = representativeNode.url;
                }
            }
        }
    }

    // Final Edge Transfer if Collapsing
    if (collapse) {
        const edges = graph.getEdges();
        const updatedEdges = new Map<string, number>();

        for (const edge of edges) {
            const sourceNode = graph.nodes.get(edge.source);
            const targetNode = graph.nodes.get(edge.target);

            if (!sourceNode || !targetNode) continue;

            // We do NOT modify source structure for out-bound edges of collapsed nodes?
            // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
            // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
            // If a collapsed node links to X, we ignore it (PageRank will filter it out).
            // NOTE(review): outbound edges of collapsed nodes are in fact KEPT here
            // under their original source (only targets are repointed); the PageRank
            // pass does exclude collapsed nodes, but confirm keeping these edges is
            // the intended behavior given the comment above.

            const actualSource = edge.source;
            // repoint target
            const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;

            // Skip self-referential edges caused by repointing
            if (actualSource === actualTarget) continue;

            const edgeKey = `${actualSource}|${actualTarget}`;
            const existingWeight = updatedEdges.get(edgeKey) || 0;
            updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
        }

        // Update graph edges in-place
        graph.edges = updatedEdges;

        // Re-calculate inLinks and outLinks based on collapsed edges
        for (const node of graph.getNodes()) {
            node.inLinks = 0;
            node.outLinks = 0;
        }
        for (const [edgeKey, _weight] of updatedEdges.entries()) {
            // NOTE(review): assumes URLs contain no '|'; a '|' inside the source
            // URL would mis-split the composite key here — confirm URLs are
            // percent-encoded upstream.
            const [src, tgt] = edgeKey.split('|');
            const sn = graph.nodes.get(src);
            const tn = graph.nodes.get(tgt);
            if (sn) sn.outLinks++;
            if (tn) tn.inLinks++;
        }
    }
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/** One crawled (or discovered) page. URL is the node's identity throughout the graph. */
export interface GraphNode {
    /** Normalized page URL; key in Graph.nodes. */
    url: string;
    /** Link depth from the crawl start; set once on discovery (BFS shortest path). */
    depth: number;
    /** Distinct inbound edges; maintained by Graph.addEdge (and rebuilt on collapse). */
    inLinks: number;
    /** Distinct outbound edges; maintained by Graph.addEdge (and rebuilt on collapse). */
    outLinks: number;
    /** HTTP status code; 0 means discovered but not yet fetched. */
    status: number;
    /** Declared canonical URL — presumably absolute/normalized at crawl time; verify in parser. */
    canonical?: string;
    /** True if robots meta/header marks the page noindex (excluded from PageRank). */
    noindex?: boolean;
    /** True if robots meta/header marks the page nofollow. */
    nofollow?: boolean;
    /** Outbound link targets that resolved to errors — assumption; confirm against crawler. */
    brokenLinks?: string[];
    /** Sequence of redirect hops followed to reach this URL — assumption; confirm against fetcher. */
    redirectChain?: string[];
    /** Change classification relative to a previous crawl snapshot. */
    incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
    /** ETag response header — presumably kept for conditional re-crawls; verify in fetcher. */
    etag?: string;
    /** Last-Modified response header — presumably kept for conditional re-crawls; verify in fetcher. */
    lastModified?: string;
    /** Content hash used for exact-duplicate grouping. */
    contentHash?: string;
    /** Raw page HTML, when retained. */
    html?: string;
    /** Raw PageRank probability (written by computePageRank). */
    pageRank?: number;
    /** PageRank min-max normalized to 0-100 (written by computePageRank). */
    pageRankScore?: number;
    /** HITS-style authority score — assumption based on name; see scoring/hits. */
    authorityScore?: number;
    /** HITS-style hub score — assumption based on name; see scoring/hits. */
    hubScore?: number;
    /** Id of the duplicate cluster this node belongs to (set by detectDuplicates). */
    duplicateClusterId?: string;
    /** Duplicate classification (set by detectDuplicates). */
    duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
    /** True for the representative page of its duplicate cluster. */
    isClusterPrimary?: boolean;
    /** True when this node was collapsed into its cluster representative. */
    isCollapsed?: boolean;
    /** URL of the representative this node was collapsed into. */
    collapseInto?: string;
    /** 64-bit simhash as a decimal string; parsed with BigInt() for banding/Hamming checks. */
    simhash?: string;
    /** Unique-token ratio; cluster average < 0.3 flags 'template_heavy'. */
    uniqueTokenRatio?: number;
    /** Soft-404 likelihood in [0,1] — assumed range; above ~0.8 the node is excluded from PageRank. */
    soft404Score?: number;
    /** Evidence strings behind soft404Score — see soft404 analysis module. */
    soft404Signals?: string[];
    /** True when the URL matched a crawl-trap heuristic — see crawler/trap. */
    crawlTrapFlag?: boolean;
    /** Trap risk score — semantics defined in crawler/trap; confirm range there. */
    crawlTrapRisk?: number;
    /** Trap heuristic name that fired, if any. */
    trapType?: string;
    /** Reason the security layer (e.g. ipGuard) blocked the fetch, if it did. */
    securityError?: string;
    /** Retry attempts consumed while fetching — assumption; confirm against retryPolicy. */
    retries?: number;
    /** Content-cluster id (see graph/cluster); distinct from duplicateClusterId. */
    clusterId?: number;
    /** Response size in bytes — presumably body bytes; confirm whether headers count. */
    bytesReceived?: number;
    /** Link-graph role classification — assigned elsewhere (scoring/hits presumably). */
    linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
}

/** Directed weighted link between two pages (materialized from Graph.edges keys). */
export interface GraphEdge {
    source: string;
    target: string;
    weight: number;
}

/** Summary of one content cluster (see graph/cluster for how these are built). */
export interface ClusterInfo {
    id: number;
    /** Number of member pages. */
    count: number;
    /** URL chosen to represent the cluster. */
    primaryUrl: string;
    risk: 'low' | 'medium' | 'high';
    /** Longest common URL path prefix among members, when one exists. */
    sharedPathPrefix?: string;
}

/** Per-session crawl counters (incremented by the crawler, reported via metrics). */
export interface CrawlStats {
    pagesFetched: number;
    pagesCached: number;
    pagesSkipped: number;
    totalFound: number;
}
|
|
60
|
+
|
|
61
|
+
export class Graph {
|
|
62
|
+
nodes: Map<string, GraphNode> = new Map();
|
|
63
|
+
// Using string "source|target" to ensure uniqueness efficiently. Mapping to weight.
|
|
64
|
+
edges: Map<string, number> = new Map();
|
|
65
|
+
limitReached: boolean = false;
|
|
66
|
+
sessionStats: CrawlStats = {
|
|
67
|
+
pagesFetched: 0,
|
|
68
|
+
pagesCached: 0,
|
|
69
|
+
pagesSkipped: 0,
|
|
70
|
+
totalFound: 0
|
|
71
|
+
};
|
|
72
|
+
trapClusters: { pattern: string; type: string; count: number }[] = [];
|
|
73
|
+
duplicateClusters: { id: string; type: 'exact' | 'near' | 'template_heavy'; size: number; representative: string; severity: 'low' | 'medium' | 'high' }[] = [];
|
|
74
|
+
contentClusters: ClusterInfo[] = [];
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Adds a node to the graph if it doesn't exist.
|
|
78
|
+
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
79
|
+
* Depth is only set on creation (BFS guarantees shortest path first).
|
|
80
|
+
*/
|
|
81
|
+
addNode(url: string, depth: number, status: number = 0) {
|
|
82
|
+
const existing = this.nodes.get(url);
|
|
83
|
+
if (!existing) {
|
|
84
|
+
this.nodes.set(url, {
|
|
85
|
+
url,
|
|
86
|
+
depth,
|
|
87
|
+
status,
|
|
88
|
+
inLinks: 0,
|
|
89
|
+
outLinks: 0
|
|
90
|
+
});
|
|
91
|
+
} else {
|
|
92
|
+
// Update status if we have a real one now (e.g. was 0/pending, now crawled)
|
|
93
|
+
if (status !== 0) {
|
|
94
|
+
existing.status = status;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
updateNodeData(url: string, data: Partial<GraphNode>) {
|
|
100
|
+
const existing = this.nodes.get(url);
|
|
101
|
+
if (existing) {
|
|
102
|
+
Object.assign(existing, data);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Adds a directed edge between two nodes.
|
|
108
|
+
* Both nodes must exist in the graph.
|
|
109
|
+
* Updates inLinks and outLinks counts.
|
|
110
|
+
*/
|
|
111
|
+
addEdge(source: string, target: string, weight: number = 1.0) {
|
|
112
|
+
const sourceNode = this.nodes.get(source);
|
|
113
|
+
const targetNode = this.nodes.get(target);
|
|
114
|
+
|
|
115
|
+
if (sourceNode && targetNode) {
|
|
116
|
+
const edgeKey = `${source}|${target}`;
|
|
117
|
+
if (!this.edges.has(edgeKey)) {
|
|
118
|
+
this.edges.set(edgeKey, weight);
|
|
119
|
+
sourceNode.outLinks++;
|
|
120
|
+
targetNode.inLinks++;
|
|
121
|
+
} else {
|
|
122
|
+
// If edge exists, keep highest weight (or could sum, but usually we just want the 'best' relationship)
|
|
123
|
+
const currentWeight = this.edges.get(edgeKey) || 0;
|
|
124
|
+
if (weight > currentWeight) {
|
|
125
|
+
this.edges.set(edgeKey, weight);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
getNodes(): GraphNode[] {
|
|
132
|
+
return Array.from(this.nodes.values());
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
getEdges(): GraphEdge[] {
|
|
136
|
+
return Array.from(this.edges.entries()).map(([edge, weight]) => {
|
|
137
|
+
const [source, target] = edge.split('|');
|
|
138
|
+
return { source, target, weight };
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
toJSON() {
|
|
143
|
+
return {
|
|
144
|
+
nodes: this.getNodes(),
|
|
145
|
+
edges: this.getEdges(),
|
|
146
|
+
duplicateClusters: this.duplicateClusters,
|
|
147
|
+
contentClusters: this.contentClusters
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
static fromJSON(json: any): Graph {
|
|
152
|
+
const graph = new Graph();
|
|
153
|
+
if (json.nodes) {
|
|
154
|
+
for (const node of json.nodes) {
|
|
155
|
+
graph.nodes.set(node.url, { ...node });
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
if (json.edges) {
|
|
159
|
+
for (const edge of json.edges) {
|
|
160
|
+
const key = `${edge.source}|${edge.target}`;
|
|
161
|
+
graph.edges.set(key, edge.weight || 1.0);
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
if (json.duplicateClusters) {
|
|
165
|
+
graph.duplicateClusters = json.duplicateClusters;
|
|
166
|
+
}
|
|
167
|
+
if (json.contentClusters) {
|
|
168
|
+
graph.contentClusters = json.contentClusters;
|
|
169
|
+
}
|
|
170
|
+
return graph;
|
|
171
|
+
}
|
|
172
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { Graph, GraphNode } from './graph.js';
|
|
2
|
+
|
|
3
|
+
/** Aggregate structural metrics computed from a crawl graph by calculateMetrics. */
export interface Metrics {
    /** Total node count. */
    totalPages: number;
    /** Total distinct edge count. */
    totalEdges: number;
    /** URLs with zero inbound links at depth > 0 (unreachable via internal links). */
    orphanPages: string[];
    /** URLs with exactly one inbound link at depth >= 3. */
    nearOrphans: string[];
    /** URLs at depth >= 4. */
    deepPages: string[];
    /** Top 10 pages by (log-scaled, normalized) in-degree authority. */
    topAuthorityPages: { url: string; authority: number }[];
    /** totalEdges / totalPages. */
    averageOutDegree: number;
    /** Deepest depth observed among all nodes. */
    maxDepthFound: number;
    /** 1 - (deepPages / totalPages); 1 when the graph is empty. */
    crawlEfficiencyScore: number;
    /** Mean node depth; 0 when the graph is empty. */
    averageDepth: number;
    /** Shannon entropy (bits) of the out-degree distribution. */
    structuralEntropy: number;
    /** Top 10 pages by raw PageRank (only pages with pageRank set). */
    topPageRankPages: { url: string; score: number }[];
    /** Mirrors Graph.limitReached (crawl stopped by a budget). */
    limitReached: boolean;
    /** Per-session crawl counters, copied from Graph.sessionStats. */
    sessionStats?: {
        pagesFetched: number;
        pagesCached: number;
        pagesSkipped: number;
        totalFound: number;
    };
}
|
|
24
|
+
|
|
25
|
+
export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
|
|
26
|
+
const nodes = graph.getNodes();
|
|
27
|
+
const edges = graph.getEdges();
|
|
28
|
+
|
|
29
|
+
const totalPages = nodes.length;
|
|
30
|
+
const totalEdges = edges.length;
|
|
31
|
+
|
|
32
|
+
// Authority Score (per node)
|
|
33
|
+
const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
|
|
34
|
+
const getAuthority = (node: GraphNode) => {
|
|
35
|
+
if (maxInLinks === 0) return 0;
|
|
36
|
+
return Math.log(1 + node.inLinks) / Math.log(1 + maxInLinks);
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
// orphanPages: inLinks === 0 && depth > 0
|
|
40
|
+
const orphanPages = nodes
|
|
41
|
+
.filter(n => n.inLinks === 0 && n.depth > 0)
|
|
42
|
+
.map(n => n.url);
|
|
43
|
+
|
|
44
|
+
// nearOrphans: inLinks === 1 && depth >= 3
|
|
45
|
+
const nearOrphans = nodes
|
|
46
|
+
.filter(n => n.inLinks === 1 && n.depth >= 3)
|
|
47
|
+
.map(n => n.url);
|
|
48
|
+
|
|
49
|
+
// deepPages: depth >= 4
|
|
50
|
+
const deepPages = nodes
|
|
51
|
+
.filter(n => n.depth >= 4) // Per requirement
|
|
52
|
+
.map(n => n.url);
|
|
53
|
+
|
|
54
|
+
// crawlEfficiencyScore: 1 - (deepPagesCount / totalPages)
|
|
55
|
+
const deepPagesCount = deepPages.length;
|
|
56
|
+
const crawlEfficiencyScore = totalPages > 0 ? 1 - (deepPagesCount / totalPages) : 1;
|
|
57
|
+
|
|
58
|
+
// averageDepth: sum(depth) / totalPages
|
|
59
|
+
const sumDepth = nodes.reduce((acc, n) => acc + n.depth, 0);
|
|
60
|
+
const averageDepth = totalPages > 0 ? sumDepth / totalPages : 0;
|
|
61
|
+
|
|
62
|
+
// structuralEntropy: Shannon entropy over outDegree distribution
|
|
63
|
+
const outDegreeCounts = new Map<number, number>();
|
|
64
|
+
nodes.forEach(n => {
|
|
65
|
+
outDegreeCounts.set(n.outLinks, (outDegreeCounts.get(n.outLinks) || 0) + 1);
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
let structuralEntropy = 0;
|
|
69
|
+
if (totalPages > 0) {
|
|
70
|
+
for (const count of outDegreeCounts.values()) {
|
|
71
|
+
const p = count / totalPages;
|
|
72
|
+
if (p > 0) {
|
|
73
|
+
structuralEntropy -= p * Math.log2(p);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// topAuthorityPages: Top 10 by authority
|
|
79
|
+
const topAuthorityPages = [...nodes]
|
|
80
|
+
.map(n => ({ url: n.url, authority: n.authorityScore ?? getAuthority(n) }))
|
|
81
|
+
.sort((a, b) => b.authority - a.authority)
|
|
82
|
+
.slice(0, 10);
|
|
83
|
+
|
|
84
|
+
// topPageRankPages: Top 10 by raw PageRank
|
|
85
|
+
const topPageRankPages = [...nodes]
|
|
86
|
+
.filter(n => n.pageRank !== undefined)
|
|
87
|
+
.map(n => ({ url: n.url, score: n.pageRank! }))
|
|
88
|
+
.sort((a, b) => b.score - a.score)
|
|
89
|
+
.slice(0, 10);
|
|
90
|
+
|
|
91
|
+
const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
|
|
92
|
+
const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);
|
|
93
|
+
|
|
94
|
+
return {
|
|
95
|
+
totalPages,
|
|
96
|
+
totalEdges,
|
|
97
|
+
orphanPages,
|
|
98
|
+
nearOrphans,
|
|
99
|
+
deepPages,
|
|
100
|
+
topAuthorityPages,
|
|
101
|
+
averageOutDegree,
|
|
102
|
+
maxDepthFound,
|
|
103
|
+
crawlEfficiencyScore,
|
|
104
|
+
averageDepth,
|
|
105
|
+
structuralEntropy,
|
|
106
|
+
topPageRankPages,
|
|
107
|
+
limitReached: graph.limitReached,
|
|
108
|
+
sessionStats: graph.sessionStats
|
|
109
|
+
};
|
|
110
|
+
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import { Graph, GraphNode } from './graph.js';
|
|
2
|
+
|
|
3
|
+
/** Tunables for computePageRank; every field falls back to a sensible default. */
interface PageRankOptions {
    /** Probability of following a link vs. teleporting (default 0.85). */
    dampingFactor?: number;
    /** Hard cap on power iterations (default 40). */
    maxIterations?: number;
    /** Stop once the largest per-page rank change drops below this (default 1e-5). */
    convergenceThreshold?: number;
    /** Pages with soft404Score above this are excluded from ranking (default 0.8). */
    soft404WeightThreshold?: number;
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Production-Grade Weighted PageRank Engine
|
|
12
|
+
*/
|
|
13
|
+
export function computePageRank(graph: Graph, options: PageRankOptions = {}) {
|
|
14
|
+
const d = options.dampingFactor ?? 0.85;
|
|
15
|
+
const maxIterations = options.maxIterations ?? 40;
|
|
16
|
+
const epsilon = options.convergenceThreshold ?? 1e-5;
|
|
17
|
+
const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
|
|
18
|
+
|
|
19
|
+
const allNodes = graph.getNodes();
|
|
20
|
+
const allEdges = graph.getEdges();
|
|
21
|
+
|
|
22
|
+
// 1. Filter Eligible Nodes
|
|
23
|
+
const eligibleNodes = allNodes.filter(node => {
|
|
24
|
+
if (node.noindex) return false;
|
|
25
|
+
if (node.isCollapsed) return false;
|
|
26
|
+
if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
|
|
27
|
+
if (node.canonical && node.canonical !== node.url) return false;
|
|
28
|
+
if (node.status >= 400) return false; // Don't pass rank to broken pages
|
|
29
|
+
return true;
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
const nodeCount = eligibleNodes.length;
|
|
33
|
+
if (nodeCount === 0) return;
|
|
34
|
+
|
|
35
|
+
const nodeUrls = eligibleNodes.map(n => n.url);
|
|
36
|
+
const nodeMap = new Map<string, GraphNode>();
|
|
37
|
+
eligibleNodes.forEach(n => nodeMap.set(n.url, n));
|
|
38
|
+
|
|
39
|
+
// Initialize PageRank
|
|
40
|
+
let pr = new Map<string, number>();
|
|
41
|
+
nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));
|
|
42
|
+
|
|
43
|
+
// Pre-calculate weighted outbound sums and inverted adjacency
|
|
44
|
+
const outWeights = new Map<string, number>();
|
|
45
|
+
const incoming = new Map<string, { source: string; weight: number }[]>();
|
|
46
|
+
const sinks: string[] = [];
|
|
47
|
+
|
|
48
|
+
// Initialize outWeights for all eligible nodes
|
|
49
|
+
nodeUrls.forEach(url => outWeights.set(url, 0));
|
|
50
|
+
|
|
51
|
+
for (const edge of allEdges) {
|
|
52
|
+
if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
|
|
53
|
+
const weight = edge.weight || 1.0;
|
|
54
|
+
|
|
55
|
+
const sources = incoming.get(edge.target) ?? [];
|
|
56
|
+
sources.push({ source: edge.source, weight });
|
|
57
|
+
incoming.set(edge.target, sources);
|
|
58
|
+
|
|
59
|
+
outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Identify sinks
|
|
64
|
+
nodeUrls.forEach(url => {
|
|
65
|
+
if ((outWeights.get(url) || 0) === 0) {
|
|
66
|
+
sinks.push(url);
|
|
67
|
+
}
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
// Iterative Calculation
|
|
71
|
+
for (let i = 0; i < maxIterations; i++) {
|
|
72
|
+
const nextPr = new Map<string, number>();
|
|
73
|
+
|
|
74
|
+
// Calculate total rank from sinks to redistribute
|
|
75
|
+
let sinkRankTotal = 0;
|
|
76
|
+
for (const url of sinks) {
|
|
77
|
+
sinkRankTotal += pr.get(url) || 0;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
|
|
81
|
+
|
|
82
|
+
for (const url of nodeUrls) {
|
|
83
|
+
let rankFromLinks = 0;
|
|
84
|
+
const sources = incoming.get(url) || [];
|
|
85
|
+
|
|
86
|
+
for (const edge of sources) {
|
|
87
|
+
const sourceRank = pr.get(edge.source) || 0;
|
|
88
|
+
const sourceOutWeight = outWeights.get(edge.source) || 1.0;
|
|
89
|
+
rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
const newRank = baseRank + d * rankFromLinks;
|
|
93
|
+
nextPr.set(url, newRank);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Convergence check
|
|
97
|
+
let maxDelta = 0;
|
|
98
|
+
for (const url of nodeUrls) {
|
|
99
|
+
const delta = Math.abs(nextPr.get(url)! - pr.get(url)!);
|
|
100
|
+
if (delta > maxDelta) maxDelta = delta;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
pr = nextPr;
|
|
104
|
+
|
|
105
|
+
if (maxDelta < epsilon) break;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// 2. Normalization (0-100)
|
|
109
|
+
const ranks = Array.from(pr.values());
|
|
110
|
+
const minPR = Math.min(...ranks);
|
|
111
|
+
const maxPR = Math.max(...ranks);
|
|
112
|
+
const range = maxPR - minPR;
|
|
113
|
+
|
|
114
|
+
for (const node of eligibleNodes) {
|
|
115
|
+
const rawRank = pr.get(node.url)!;
|
|
116
|
+
node.pageRank = rawRank;
|
|
117
|
+
|
|
118
|
+
if (range > 1e-12) {
|
|
119
|
+
node.pageRankScore = 100 * (rawRank - minPR) / range;
|
|
120
|
+
} else {
|
|
121
|
+
// If there's no range, all eligible pages are equally important.
|
|
122
|
+
node.pageRankScore = 100;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|