@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/graph/cluster.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { Graph, GraphNode, ClusterInfo } from './graph.js';
|
|
2
2
|
import { SimHash } from './simhash.js';
|
|
3
|
+
import { load } from 'cheerio';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Detects content clusters using 64-bit SimHash and Hamming Distance.
|
|
@@ -18,24 +19,23 @@ export function detectContentClusters(
|
|
|
18
19
|
// Banding Optimization (4 bands of 16 bits)
|
|
19
20
|
// Note: For threshold > 3, this is a heuristic and may miss some pairs,
|
|
20
21
|
// but it dramatically reduces the search space as requested.
|
|
21
|
-
const
|
|
22
|
-
const bandWidth = 16;
|
|
23
|
-
const buckets: Map<number, Set<string>>[] = Array.from({ length: bands }, () => new Map());
|
|
22
|
+
const buckets: Map<number, Set<string>>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
|
|
24
23
|
|
|
25
24
|
for (const node of nodes) {
|
|
26
25
|
const hash = BigInt(node.simhash!);
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
const bandValues = SimHash.getBands(hash);
|
|
27
|
+
|
|
28
|
+
bandValues.forEach((bandValue, b) => {
|
|
29
29
|
if (!buckets[b].has(bandValue)) {
|
|
30
30
|
buckets[b].set(bandValue, new Set());
|
|
31
31
|
}
|
|
32
32
|
buckets[b].get(bandValue)!.add(node.url);
|
|
33
|
-
}
|
|
33
|
+
});
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
const checkedPairs = new Set<string>();
|
|
37
37
|
|
|
38
|
-
for (let b = 0; b <
|
|
38
|
+
for (let b = 0; b < SimHash.BANDS; b++) {
|
|
39
39
|
for (const bucket of buckets[b].values()) {
|
|
40
40
|
if (bucket.size < 2) continue;
|
|
41
41
|
const bucketNodes = Array.from(bucket);
|
|
@@ -154,14 +154,68 @@ function selectPrimaryUrl(urls: string[], graph: Graph): string {
|
|
|
154
154
|
* Calculates cannibalization risk based on title and H1 similarity within the cluster.
|
|
155
155
|
*/
|
|
156
156
|
function calculateClusterRisk(nodes: GraphNode[]): 'low' | 'medium' | 'high' {
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
//
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
157
|
+
if (nodes.length <= 1) return 'low';
|
|
158
|
+
|
|
159
|
+
// Count title and H1 occurrences
|
|
160
|
+
const titleCounts = new Map<string, number>();
|
|
161
|
+
const h1Counts = new Map<string, number>();
|
|
162
|
+
let processedCount = 0;
|
|
163
|
+
|
|
164
|
+
for (const node of nodes) {
|
|
165
|
+
if (!node.html) continue;
|
|
166
|
+
|
|
167
|
+
try {
|
|
168
|
+
const $ = load(node.html);
|
|
169
|
+
const title = $('title').text().trim().toLowerCase();
|
|
170
|
+
const h1 = $('h1').first().text().trim().toLowerCase();
|
|
171
|
+
|
|
172
|
+
if (title) {
|
|
173
|
+
titleCounts.set(title, (titleCounts.get(title) || 0) + 1);
|
|
174
|
+
}
|
|
175
|
+
if (h1) {
|
|
176
|
+
h1Counts.set(h1, (h1Counts.get(h1) || 0) + 1);
|
|
177
|
+
}
|
|
178
|
+
processedCount++;
|
|
179
|
+
} catch {
|
|
180
|
+
// Ignore parsing errors
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// If we couldn't parse enough content (e.g., no HTML stored), fallback to size-based heuristic
|
|
185
|
+
if (processedCount < nodes.length * 0.5) {
|
|
186
|
+
if (nodes.length > 5) return 'high';
|
|
187
|
+
if (nodes.length > 2) return 'medium';
|
|
188
|
+
return 'low';
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Calculate duplicate ratios
|
|
192
|
+
let duplicateTitleCount = 0;
|
|
193
|
+
let duplicateH1Count = 0;
|
|
194
|
+
|
|
195
|
+
for (const count of titleCounts.values()) {
|
|
196
|
+
if (count > 1) duplicateTitleCount += count;
|
|
197
|
+
}
|
|
198
|
+
for (const count of h1Counts.values()) {
|
|
199
|
+
if (count > 1) duplicateH1Count += count;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
const titleDupeRatio = duplicateTitleCount / nodes.length;
|
|
203
|
+
const h1DupeRatio = duplicateH1Count / nodes.length;
|
|
204
|
+
|
|
205
|
+
// Heuristic 1: High Risk
|
|
206
|
+
// Significant overlap in Titles OR H1s (e.g., > 30% of cluster members are duplicates)
|
|
207
|
+
if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3) {
|
|
208
|
+
return 'high';
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// Heuristic 2: Medium Risk
|
|
212
|
+
// Any overlap, or very large clusters (potential template issues or thin content)
|
|
213
|
+
if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10) {
|
|
214
|
+
return 'medium';
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// Heuristic 3: Low Risk
|
|
218
|
+
// Unique content and manageable cluster size
|
|
165
219
|
return 'low';
|
|
166
220
|
}
|
|
167
221
|
|
package/src/graph/duplicate.ts
CHANGED
|
@@ -21,18 +21,36 @@ interface DuplicateCluster {
|
|
|
21
21
|
export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
|
|
22
22
|
const collapse = options.collapse !== false; // Default to true
|
|
23
23
|
const threshold = options.simhashThreshold ?? 3;
|
|
24
|
-
|
|
25
|
-
const exactClusters: DuplicateCluster[] = [];
|
|
26
|
-
const nearClusters: DuplicateCluster[] = [];
|
|
27
|
-
|
|
28
24
|
const nodes = graph.getNodes();
|
|
25
|
+
let clusterCounter = 1;
|
|
29
26
|
|
|
30
27
|
// Phase 1 & 2: Exact Duplicate Detection
|
|
28
|
+
const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter);
|
|
29
|
+
clusterCounter = nextId1;
|
|
30
|
+
|
|
31
|
+
// Phase 3: Near Duplicate Detection
|
|
32
|
+
const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter);
|
|
33
|
+
|
|
34
|
+
const allClusters = [...exactClusters, ...nearClusters];
|
|
35
|
+
|
|
36
|
+
// Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
|
|
37
|
+
processClusters(allClusters, graph, collapse);
|
|
38
|
+
|
|
39
|
+
// Final Edge Transfer if Collapsing
|
|
40
|
+
if (collapse) {
|
|
41
|
+
collapseEdges(graph);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
|
|
46
|
+
const exactMap = groupNodesByContentHash(nodes);
|
|
47
|
+
return createExactClusters(exactMap, startId);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
function groupNodesByContentHash(nodes: GraphNode[]): Map<string, GraphNode[]> {
|
|
31
51
|
const exactMap = new Map<string, GraphNode[]>();
|
|
32
52
|
for (const node of nodes) {
|
|
33
53
|
if (!node.contentHash || node.status !== 200) continue;
|
|
34
|
-
|
|
35
|
-
// Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
|
|
36
54
|
let arr = exactMap.get(node.contentHash);
|
|
37
55
|
if (!arr) {
|
|
38
56
|
arr = [];
|
|
@@ -40,16 +58,18 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
|
|
|
40
58
|
}
|
|
41
59
|
arr.push(node);
|
|
42
60
|
}
|
|
61
|
+
return exactMap;
|
|
62
|
+
}
|
|
43
63
|
|
|
44
|
-
|
|
64
|
+
function createExactClusters(exactMap: Map<string, GraphNode[]>, startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
|
|
65
|
+
const exactClusters: DuplicateCluster[] = [];
|
|
45
66
|
const nearCandidates: GraphNode[] = [];
|
|
46
|
-
let clusterCounter =
|
|
67
|
+
let clusterCounter = startId;
|
|
47
68
|
|
|
48
69
|
for (const [_hash, group] of exactMap.entries()) {
|
|
49
70
|
if (group.length > 1) {
|
|
50
71
|
const id = `cluster_exact_${clusterCounter++}`;
|
|
51
72
|
exactClusters.push({ id, type: 'exact', nodes: group });
|
|
52
|
-
// Mark nodes
|
|
53
73
|
for (const n of group) {
|
|
54
74
|
n.duplicateClusterId = id;
|
|
55
75
|
n.duplicateType = 'exact';
|
|
@@ -59,228 +79,272 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
|
|
|
59
79
|
}
|
|
60
80
|
}
|
|
61
81
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
return { exactClusters, nearCandidates, nextId: clusterCounter };
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
|
|
86
|
+
const { bandsMaps, simhashes } = buildSimHashBuckets(candidates);
|
|
87
|
+
const { parent, involvedIndices } = findConnectedComponents(bandsMaps, simhashes, candidates.length, threshold);
|
|
88
|
+
return extractClusters(parent, involvedIndices, candidates, startId);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function buildSimHashBuckets(candidates: GraphNode[]): { bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, validIndices: number[] } {
|
|
92
|
+
const n = candidates.length;
|
|
93
|
+
const simhashes = new BigUint64Array(n);
|
|
94
|
+
const validIndices: number[] = [];
|
|
95
|
+
|
|
96
|
+
for (let i = 0; i < n; i++) {
|
|
97
|
+
if (candidates[i].simhash) {
|
|
98
|
+
simhashes[i] = BigInt(candidates[i].simhash!);
|
|
99
|
+
validIndices.push(i);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const bandsMaps: Map<number, number[]>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
|
|
104
|
+
|
|
105
|
+
for (const idx of validIndices) {
|
|
106
|
+
const bands = SimHash.getBands(simhashes[idx]);
|
|
107
|
+
for (let b = 0; b < SimHash.BANDS; b++) {
|
|
108
|
+
let arr = bandsMaps[b].get(bands[b]);
|
|
84
109
|
if (!arr) {
|
|
85
110
|
arr = [];
|
|
86
|
-
bandsMaps[
|
|
111
|
+
bandsMaps[b].set(bands[b], arr);
|
|
112
|
+
}
|
|
113
|
+
arr.push(idx);
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
return { bandsMaps, simhashes, validIndices };
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
function findConnectedComponents(bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, n: number, threshold: number): { parent: Uint32Array, involvedIndices: Set<number> } {
|
|
121
|
+
// Union-Find Arrays (Integer-based)
|
|
122
|
+
const parent = new Uint32Array(n);
|
|
123
|
+
const rank = new Uint8Array(n);
|
|
124
|
+
for (let i = 0; i < n; i++) {
|
|
125
|
+
parent[i] = i;
|
|
126
|
+
rank[i] = 0;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
function find(i: number): number {
|
|
130
|
+
let root = i;
|
|
131
|
+
while (parent[root] !== root) {
|
|
132
|
+
root = parent[root];
|
|
133
|
+
}
|
|
134
|
+
let curr = i;
|
|
135
|
+
while (curr !== root) {
|
|
136
|
+
const next = parent[curr];
|
|
137
|
+
parent[curr] = root;
|
|
138
|
+
curr = next;
|
|
139
|
+
}
|
|
140
|
+
return root;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
function union(i: number, j: number) {
|
|
144
|
+
const rootI = find(i);
|
|
145
|
+
const rootJ = find(j);
|
|
146
|
+
if (rootI !== rootJ) {
|
|
147
|
+
const rankI = rank[rootI];
|
|
148
|
+
const rankJ = rank[rootJ];
|
|
149
|
+
if (rankI < rankJ) {
|
|
150
|
+
parent[rootI] = rootJ;
|
|
151
|
+
} else if (rankI > rankJ) {
|
|
152
|
+
parent[rootJ] = rootI;
|
|
153
|
+
} else {
|
|
154
|
+
parent[rootJ] = rootI;
|
|
155
|
+
rank[rootI]++;
|
|
87
156
|
}
|
|
88
|
-
arr.push(node);
|
|
89
157
|
}
|
|
90
158
|
}
|
|
91
159
|
|
|
92
|
-
|
|
93
|
-
const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set
|
|
94
|
-
const checkedPairs = new Set<string>();
|
|
160
|
+
const involvedIndices = new Set<number>();
|
|
95
161
|
|
|
96
|
-
for (let
|
|
97
|
-
for (const
|
|
98
|
-
if (
|
|
162
|
+
for (let b = 0; b < SimHash.BANDS; b++) {
|
|
163
|
+
for (const bucketIndices of bandsMaps[b].values()) {
|
|
164
|
+
if (bucketIndices.length < 2) continue;
|
|
99
165
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
const
|
|
104
|
-
const n2 = bucketNodes[k];
|
|
166
|
+
for (let j = 0; j < bucketIndices.length; j++) {
|
|
167
|
+
for (let k = j + 1; k < bucketIndices.length; k++) {
|
|
168
|
+
const idx1 = bucketIndices[j];
|
|
169
|
+
const idx2 = bucketIndices[k];
|
|
105
170
|
|
|
106
|
-
|
|
107
|
-
const
|
|
108
|
-
const pairKey = `${a.url}|${b.url}`;
|
|
171
|
+
const root1 = find(idx1);
|
|
172
|
+
const root2 = find(idx2);
|
|
109
173
|
|
|
110
|
-
if (
|
|
111
|
-
checkedPairs.add(pairKey);
|
|
174
|
+
if (root1 === root2) continue; // Already connected, skip expensive distance check
|
|
112
175
|
|
|
113
|
-
const dist = SimHash.hammingDistance(
|
|
176
|
+
const dist = SimHash.hammingDistance(simhashes[idx1], simhashes[idx2]);
|
|
114
177
|
if (dist <= threshold) {
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
const setB = nearGroupMap.get(b.url);
|
|
119
|
-
|
|
120
|
-
if (!setA && !setB) {
|
|
121
|
-
const newSet = new Set<GraphNode>([a, b]);
|
|
122
|
-
nearGroupMap.set(a.url, newSet);
|
|
123
|
-
nearGroupMap.set(b.url, newSet);
|
|
124
|
-
} else if (setA && !setB) {
|
|
125
|
-
setA.add(b);
|
|
126
|
-
nearGroupMap.set(b.url, setA);
|
|
127
|
-
} else if (setB && !setA) {
|
|
128
|
-
setB.add(a);
|
|
129
|
-
nearGroupMap.set(a.url, setB);
|
|
130
|
-
} else if (setA && setB && setA !== setB) {
|
|
131
|
-
// Merge sets
|
|
132
|
-
for (const node of setB) {
|
|
133
|
-
setA.add(node);
|
|
134
|
-
nearGroupMap.set(node.url, setA);
|
|
135
|
-
}
|
|
136
|
-
}
|
|
178
|
+
union(root1, root2);
|
|
179
|
+
involvedIndices.add(idx1);
|
|
180
|
+
involvedIndices.add(idx2);
|
|
137
181
|
}
|
|
138
182
|
}
|
|
139
183
|
}
|
|
140
184
|
}
|
|
141
185
|
}
|
|
142
186
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
187
|
+
return { parent, involvedIndices };
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
function extractClusters(parent: Uint32Array, involvedIndices: Set<number>, candidates: GraphNode[], startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
|
|
191
|
+
const nearClusters: DuplicateCluster[] = [];
|
|
192
|
+
let clusterCounter = startId;
|
|
193
|
+
|
|
194
|
+
function find(i: number): number {
|
|
195
|
+
let root = i;
|
|
196
|
+
while (parent[root] !== root) {
|
|
197
|
+
root = parent[root];
|
|
198
|
+
}
|
|
199
|
+
let curr = i;
|
|
200
|
+
while (curr !== root) {
|
|
201
|
+
const next = parent[curr];
|
|
202
|
+
parent[curr] = root;
|
|
203
|
+
curr = next;
|
|
204
|
+
}
|
|
205
|
+
return root;
|
|
147
206
|
}
|
|
148
207
|
|
|
149
|
-
|
|
150
|
-
|
|
208
|
+
// Compile clusters
|
|
209
|
+
const clusterMap = new Map<number, number[]>();
|
|
210
|
+
for (const idx of involvedIndices) {
|
|
211
|
+
const root = find(idx);
|
|
212
|
+
let group = clusterMap.get(root);
|
|
213
|
+
if (!group) {
|
|
214
|
+
group = [];
|
|
215
|
+
clusterMap.set(root, group);
|
|
216
|
+
}
|
|
217
|
+
group.push(idx);
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
for (const groupIndices of clusterMap.values()) {
|
|
221
|
+
if (groupIndices.length > 1) {
|
|
151
222
|
const id = `cluster_near_${clusterCounter++}`;
|
|
152
|
-
const
|
|
153
|
-
nearClusters.push({ id, type: 'near', nodes:
|
|
154
|
-
for (const n of
|
|
223
|
+
const groupNodes = groupIndices.map(idx => candidates[idx]);
|
|
224
|
+
nearClusters.push({ id, type: 'near', nodes: groupNodes });
|
|
225
|
+
for (const n of groupNodes) {
|
|
155
226
|
n.duplicateClusterId = id;
|
|
156
227
|
n.duplicateType = 'near';
|
|
157
228
|
}
|
|
158
229
|
}
|
|
159
230
|
}
|
|
160
231
|
|
|
161
|
-
|
|
232
|
+
return { nearClusters, nextId: clusterCounter };
|
|
233
|
+
}
|
|
162
234
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
|
|
167
|
-
if (avgRatio < 0.3) {
|
|
168
|
-
cluster.type = 'template_heavy';
|
|
169
|
-
cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
|
|
170
|
-
}
|
|
235
|
+
function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) {
|
|
236
|
+
for (const cluster of clusters) {
|
|
237
|
+
processSingleCluster(cluster, graph, collapse);
|
|
171
238
|
}
|
|
239
|
+
}
|
|
172
240
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
241
|
+
function processSingleCluster(cluster: DuplicateCluster, graph: Graph, collapse: boolean) {
|
|
242
|
+
checkTemplateHeavy(cluster);
|
|
243
|
+
cluster.severity = calculateSeverity(cluster);
|
|
244
|
+
const representative = selectRepresentative(cluster);
|
|
245
|
+
cluster.representative = representative.url;
|
|
246
|
+
applyClusterToGraph(cluster, representative, graph, collapse);
|
|
247
|
+
}
|
|
177
248
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
249
|
+
function checkTemplateHeavy(cluster: DuplicateCluster) {
|
|
250
|
+
const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
|
|
251
|
+
if (avgRatio < 0.3) {
|
|
252
|
+
cluster.type = 'template_heavy';
|
|
253
|
+
cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
|
|
254
|
+
}
|
|
255
|
+
}
|
|
183
256
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
257
|
+
function calculateSeverity(cluster: DuplicateCluster): 'low' | 'medium' | 'high' {
|
|
258
|
+
const canonicals = new Set<string>();
|
|
259
|
+
let hasMissing = false;
|
|
260
|
+
|
|
261
|
+
for (const n of cluster.nodes) {
|
|
262
|
+
if (!n.canonical) hasMissing = true;
|
|
263
|
+
else canonicals.add(n.canonical);
|
|
264
|
+
}
|
|
191
265
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
266
|
+
if (hasMissing || canonicals.size > 1) {
|
|
267
|
+
return 'high';
|
|
268
|
+
} else if (cluster.type === 'near') {
|
|
269
|
+
return 'medium';
|
|
270
|
+
} else {
|
|
271
|
+
return 'low';
|
|
272
|
+
}
|
|
273
|
+
}
|
|
198
274
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
275
|
+
function selectRepresentative(cluster: DuplicateCluster): GraphNode {
|
|
276
|
+
const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
|
|
277
|
+
const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
|
|
202
278
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
representativeNode = cluster.nodes.reduce((best, current) => {
|
|
207
|
-
if (current.inLinks > best.inLinks) return current;
|
|
208
|
-
if (current.inLinks < best.inLinks) return best;
|
|
209
|
-
if (current.url.length < best.url.length) return current;
|
|
210
|
-
return best;
|
|
211
|
-
});
|
|
212
|
-
}
|
|
279
|
+
if (validCanonicals.length > 0) {
|
|
280
|
+
return validCanonicals[0];
|
|
281
|
+
}
|
|
213
282
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
283
|
+
return cluster.nodes.reduce((best, current) => {
|
|
284
|
+
if (current.inLinks > best.inLinks) return current;
|
|
285
|
+
if (current.inLinks < best.inLinks) return best;
|
|
286
|
+
if (current.url.length < best.url.length) return current;
|
|
287
|
+
return best;
|
|
288
|
+
});
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
function applyClusterToGraph(cluster: DuplicateCluster, representative: GraphNode, graph: Graph, collapse: boolean) {
|
|
292
|
+
cluster.nodes.forEach(n => {
|
|
293
|
+
n.isClusterPrimary = n.url === representative.url;
|
|
294
|
+
n.isCollapsed = false;
|
|
295
|
+
n.collapseInto = undefined;
|
|
296
|
+
});
|
|
297
|
+
|
|
298
|
+
graph.duplicateClusters.push({
|
|
299
|
+
id: cluster.id,
|
|
300
|
+
type: cluster.type,
|
|
301
|
+
size: cluster.nodes.length,
|
|
302
|
+
representative: representative.url,
|
|
303
|
+
severity: cluster.severity!
|
|
304
|
+
});
|
|
305
|
+
|
|
306
|
+
if (collapse) {
|
|
307
|
+
for (const n of cluster.nodes) {
|
|
308
|
+
if (n.url !== representative.url) {
|
|
309
|
+
n.isCollapsed = true;
|
|
310
|
+
n.collapseInto = representative.url;
|
|
238
311
|
}
|
|
239
312
|
}
|
|
240
313
|
}
|
|
314
|
+
}
|
|
241
315
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
const updatedEdges = new Map<string, number>();
|
|
246
|
-
|
|
247
|
-
for (const edge of edges) {
|
|
248
|
-
const sourceNode = graph.nodes.get(edge.source);
|
|
249
|
-
const targetNode = graph.nodes.get(edge.target);
|
|
316
|
+
function collapseEdges(graph: Graph) {
|
|
317
|
+
const edges = graph.getEdges();
|
|
318
|
+
const updatedEdges = new Map<string, number>();
|
|
250
319
|
|
|
251
|
-
|
|
320
|
+
for (const edge of edges) {
|
|
321
|
+
const sourceNode = graph.nodes.get(edge.source);
|
|
322
|
+
const targetNode = graph.nodes.get(edge.target);
|
|
252
323
|
|
|
253
|
-
|
|
254
|
-
// Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
|
|
255
|
-
// Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
|
|
256
|
-
// If a collapsed node links to X, we ignore it (PageRank will filter it out).
|
|
324
|
+
if (!sourceNode || !targetNode) continue;
|
|
257
325
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
|
|
326
|
+
const actualSource = edge.source;
|
|
327
|
+
const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
|
|
261
328
|
|
|
262
|
-
|
|
263
|
-
if (actualSource === actualTarget) continue;
|
|
329
|
+
if (actualSource === actualTarget) continue;
|
|
264
330
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
331
|
+
const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
|
|
332
|
+
const existingWeight = updatedEdges.get(edgeKey) || 0;
|
|
333
|
+
updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
|
|
334
|
+
}
|
|
269
335
|
|
|
270
|
-
|
|
271
|
-
graph.edges = updatedEdges;
|
|
336
|
+
graph.edges = updatedEdges;
|
|
272
337
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
}
|
|
338
|
+
// Re-calculate inLinks and outLinks based on collapsed edges
|
|
339
|
+
for (const node of graph.getNodes()) {
|
|
340
|
+
node.inLinks = 0;
|
|
341
|
+
node.outLinks = 0;
|
|
342
|
+
}
|
|
343
|
+
for (const [edgeKey, _weight] of updatedEdges.entries()) {
|
|
344
|
+
const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
|
|
345
|
+
const sn = graph.nodes.get(src);
|
|
346
|
+
const tn = graph.nodes.get(tgt);
|
|
347
|
+
if (sn) sn.outLinks++;
|
|
348
|
+
if (tn) tn.inLinks++;
|
|
285
349
|
}
|
|
286
350
|
}
|