@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,5 +1,6 @@
1
1
  import { Graph, GraphNode, ClusterInfo } from './graph.js';
2
2
  import { SimHash } from './simhash.js';
3
+ import { load } from 'cheerio';
3
4
 
4
5
  /**
5
6
  * Detects content clusters using 64-bit SimHash and Hamming Distance.
@@ -18,24 +19,23 @@ export function detectContentClusters(
18
19
  // Banding Optimization (4 bands of 16 bits)
19
20
  // Note: For threshold > 3, this is a heuristic and may miss some pairs,
20
21
  // but it dramatically reduces the search space as requested.
21
- const bands = 4;
22
- const bandWidth = 16;
23
- const buckets: Map<number, Set<string>>[] = Array.from({ length: bands }, () => new Map());
22
+ const buckets: Map<number, Set<string>>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
24
23
 
25
24
  for (const node of nodes) {
26
25
  const hash = BigInt(node.simhash!);
27
- for (let b = 0; b < bands; b++) {
28
- const bandValue = Number((hash >> BigInt(b * bandWidth)) & 0xFFFFn);
26
+ const bandValues = SimHash.getBands(hash);
27
+
28
+ bandValues.forEach((bandValue, b) => {
29
29
  if (!buckets[b].has(bandValue)) {
30
30
  buckets[b].set(bandValue, new Set());
31
31
  }
32
32
  buckets[b].get(bandValue)!.add(node.url);
33
- }
33
+ });
34
34
  }
35
35
 
36
36
  const checkedPairs = new Set<string>();
37
37
 
38
- for (let b = 0; b < bands; b++) {
38
+ for (let b = 0; b < SimHash.BANDS; b++) {
39
39
  for (const bucket of buckets[b].values()) {
40
40
  if (bucket.size < 2) continue;
41
41
  const bucketNodes = Array.from(bucket);
@@ -154,14 +154,68 @@ function selectPrimaryUrl(urls: string[], graph: Graph): string {
154
154
  * Calculates cannibalization risk based on title and H1 similarity within the cluster.
155
155
  */
156
156
  function calculateClusterRisk(nodes: GraphNode[]): 'low' | 'medium' | 'high' {
157
- // Logic: Check if there's significant overlap in Titles or H1s among cluster members.
158
- // This is a heuristic as requested.
159
- // Simplified heuristic: risk is based on cluster density and size
160
- // Large clusters of highly similar content are high risk.
161
-
162
- // Fallback to a safe categorization
163
- if (nodes.length > 5) return 'high';
164
- if (nodes.length > 2) return 'medium';
157
+ if (nodes.length <= 1) return 'low';
158
+
159
+ // Count title and H1 occurrences
160
+ const titleCounts = new Map<string, number>();
161
+ const h1Counts = new Map<string, number>();
162
+ let processedCount = 0;
163
+
164
+ for (const node of nodes) {
165
+ if (!node.html) continue;
166
+
167
+ try {
168
+ const $ = load(node.html);
169
+ const title = $('title').text().trim().toLowerCase();
170
+ const h1 = $('h1').first().text().trim().toLowerCase();
171
+
172
+ if (title) {
173
+ titleCounts.set(title, (titleCounts.get(title) || 0) + 1);
174
+ }
175
+ if (h1) {
176
+ h1Counts.set(h1, (h1Counts.get(h1) || 0) + 1);
177
+ }
178
+ processedCount++;
179
+ } catch {
180
+ // Ignore parsing errors
181
+ }
182
+ }
183
+
184
+ // If we couldn't parse enough content (e.g., no HTML stored), fallback to size-based heuristic
185
+ if (processedCount < nodes.length * 0.5) {
186
+ if (nodes.length > 5) return 'high';
187
+ if (nodes.length > 2) return 'medium';
188
+ return 'low';
189
+ }
190
+
191
+ // Calculate duplicate ratios
192
+ let duplicateTitleCount = 0;
193
+ let duplicateH1Count = 0;
194
+
195
+ for (const count of titleCounts.values()) {
196
+ if (count > 1) duplicateTitleCount += count;
197
+ }
198
+ for (const count of h1Counts.values()) {
199
+ if (count > 1) duplicateH1Count += count;
200
+ }
201
+
202
+ const titleDupeRatio = duplicateTitleCount / nodes.length;
203
+ const h1DupeRatio = duplicateH1Count / nodes.length;
204
+
205
+ // Heuristic 1: High Risk
206
+ // Significant overlap in Titles OR H1s (e.g., > 30% of cluster members are duplicates)
207
+ if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3) {
208
+ return 'high';
209
+ }
210
+
211
+ // Heuristic 2: Medium Risk
212
+ // Any overlap, or very large clusters (potential template issues or thin content)
213
+ if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10) {
214
+ return 'medium';
215
+ }
216
+
217
+ // Heuristic 3: Low Risk
218
+ // Unique content and manageable cluster size
165
219
  return 'low';
166
220
  }
167
221
 
@@ -21,18 +21,36 @@ interface DuplicateCluster {
21
21
  export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
22
22
  const collapse = options.collapse !== false; // Default to true
23
23
  const threshold = options.simhashThreshold ?? 3;
24
-
25
- const exactClusters: DuplicateCluster[] = [];
26
- const nearClusters: DuplicateCluster[] = [];
27
-
28
24
  const nodes = graph.getNodes();
25
+ let clusterCounter = 1;
29
26
 
30
27
  // Phase 1 & 2: Exact Duplicate Detection
28
+ const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter);
29
+ clusterCounter = nextId1;
30
+
31
+ // Phase 3: Near Duplicate Detection
32
+ const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter);
33
+
34
+ const allClusters = [...exactClusters, ...nearClusters];
35
+
36
+ // Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
37
+ processClusters(allClusters, graph, collapse);
38
+
39
+ // Final Edge Transfer if Collapsing
40
+ if (collapse) {
41
+ collapseEdges(graph);
42
+ }
43
+ }
44
+
45
+ function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
46
+ const exactMap = groupNodesByContentHash(nodes);
47
+ return createExactClusters(exactMap, startId);
48
+ }
49
+
50
+ function groupNodesByContentHash(nodes: GraphNode[]): Map<string, GraphNode[]> {
31
51
  const exactMap = new Map<string, GraphNode[]>();
32
52
  for (const node of nodes) {
33
53
  if (!node.contentHash || node.status !== 200) continue;
34
-
35
- // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
36
54
  let arr = exactMap.get(node.contentHash);
37
55
  if (!arr) {
38
56
  arr = [];
@@ -40,16 +58,18 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
40
58
  }
41
59
  arr.push(node);
42
60
  }
61
+ return exactMap;
62
+ }
43
63
 
44
- // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
64
+ function createExactClusters(exactMap: Map<string, GraphNode[]>, startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
65
+ const exactClusters: DuplicateCluster[] = [];
45
66
  const nearCandidates: GraphNode[] = [];
46
- let clusterCounter = 1;
67
+ let clusterCounter = startId;
47
68
 
48
69
  for (const [_hash, group] of exactMap.entries()) {
49
70
  if (group.length > 1) {
50
71
  const id = `cluster_exact_${clusterCounter++}`;
51
72
  exactClusters.push({ id, type: 'exact', nodes: group });
52
- // Mark nodes
53
73
  for (const n of group) {
54
74
  n.duplicateClusterId = id;
55
75
  n.duplicateType = 'exact';
@@ -59,228 +79,272 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
59
79
  }
60
80
  }
61
81
 
62
- // Phase 3: Near Duplicate Detection (SimHash with Bands)
63
- // 64-bit simhash -> split into 4 bands of 16 bits.
64
- const bandsMaps = [
65
- new Map<number, GraphNode[]>(),
66
- new Map<number, GraphNode[]>(),
67
- new Map<number, GraphNode[]>(),
68
- new Map<number, GraphNode[]>()
69
- ];
70
-
71
- for (const node of nearCandidates) {
72
- if (!node.simhash) continue;
73
- const simhash = BigInt(node.simhash);
74
-
75
- // Extract 16 bit bands
76
- const b0 = Number(simhash & 0xFFFFn);
77
- const b1 = Number((simhash >> 16n) & 0xFFFFn);
78
- const b2 = Number((simhash >> 32n) & 0xFFFFn);
79
- const b3 = Number((simhash >> 48n) & 0xFFFFn);
80
-
81
- const bands = [b0, b1, b2, b3];
82
- for (let i = 0; i < 4; i++) {
83
- let arr = bandsMaps[i].get(bands[i]);
82
+ return { exactClusters, nearCandidates, nextId: clusterCounter };
83
+ }
84
+
85
+ function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
86
+ const { bandsMaps, simhashes } = buildSimHashBuckets(candidates);
87
+ const { parent, involvedIndices } = findConnectedComponents(bandsMaps, simhashes, candidates.length, threshold);
88
+ return extractClusters(parent, involvedIndices, candidates, startId);
89
+ }
90
+
91
+ function buildSimHashBuckets(candidates: GraphNode[]): { bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, validIndices: number[] } {
92
+ const n = candidates.length;
93
+ const simhashes = new BigUint64Array(n);
94
+ const validIndices: number[] = [];
95
+
96
+ for (let i = 0; i < n; i++) {
97
+ if (candidates[i].simhash) {
98
+ simhashes[i] = BigInt(candidates[i].simhash!);
99
+ validIndices.push(i);
100
+ }
101
+ }
102
+
103
+ const bandsMaps: Map<number, number[]>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
104
+
105
+ for (const idx of validIndices) {
106
+ const bands = SimHash.getBands(simhashes[idx]);
107
+ for (let b = 0; b < SimHash.BANDS; b++) {
108
+ let arr = bandsMaps[b].get(bands[b]);
84
109
  if (!arr) {
85
110
  arr = [];
86
- bandsMaps[i].set(bands[i], arr);
111
+ bandsMaps[b].set(bands[b], arr);
112
+ }
113
+ arr.push(idx);
114
+ }
115
+ }
116
+
117
+ return { bandsMaps, simhashes, validIndices };
118
+ }
119
+
120
+ function findConnectedComponents(bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, n: number, threshold: number): { parent: Uint32Array, involvedIndices: Set<number> } {
121
+ // Union-Find Arrays (Integer-based)
122
+ const parent = new Uint32Array(n);
123
+ const rank = new Uint8Array(n);
124
+ for (let i = 0; i < n; i++) {
125
+ parent[i] = i;
126
+ rank[i] = 0;
127
+ }
128
+
129
+ function find(i: number): number {
130
+ let root = i;
131
+ while (parent[root] !== root) {
132
+ root = parent[root];
133
+ }
134
+ let curr = i;
135
+ while (curr !== root) {
136
+ const next = parent[curr];
137
+ parent[curr] = root;
138
+ curr = next;
139
+ }
140
+ return root;
141
+ }
142
+
143
+ function union(i: number, j: number) {
144
+ const rootI = find(i);
145
+ const rootJ = find(j);
146
+ if (rootI !== rootJ) {
147
+ const rankI = rank[rootI];
148
+ const rankJ = rank[rootJ];
149
+ if (rankI < rankJ) {
150
+ parent[rootI] = rootJ;
151
+ } else if (rankI > rankJ) {
152
+ parent[rootJ] = rootI;
153
+ } else {
154
+ parent[rootJ] = rootI;
155
+ rank[rootI]++;
87
156
  }
88
- arr.push(node);
89
157
  }
90
158
  }
91
159
 
92
- // Find candidate pairs
93
- const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set
94
- const checkedPairs = new Set<string>();
160
+ const involvedIndices = new Set<number>();
95
161
 
96
- for (let i = 0; i < 4; i++) {
97
- for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
98
- if (bucketNodes.length < 2) continue; // nothing to compare
162
+ for (let b = 0; b < SimHash.BANDS; b++) {
163
+ for (const bucketIndices of bandsMaps[b].values()) {
164
+ if (bucketIndices.length < 2) continue;
99
165
 
100
- // Compare all nodes in this bucket
101
- for (let j = 0; j < bucketNodes.length; j++) {
102
- for (let k = j + 1; k < bucketNodes.length; k++) {
103
- const n1 = bucketNodes[j];
104
- const n2 = bucketNodes[k];
166
+ for (let j = 0; j < bucketIndices.length; j++) {
167
+ for (let k = j + 1; k < bucketIndices.length; k++) {
168
+ const idx1 = bucketIndices[j];
169
+ const idx2 = bucketIndices[k];
105
170
 
106
- // Ensure n1 < n2 lexicographically to avoid duplicate pairs
107
- const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
108
- const pairKey = `${a.url}|${b.url}`;
171
+ const root1 = find(idx1);
172
+ const root2 = find(idx2);
109
173
 
110
- if (checkedPairs.has(pairKey)) continue;
111
- checkedPairs.add(pairKey);
174
+ if (root1 === root2) continue; // Already connected, skip expensive distance check
112
175
 
113
- const dist = SimHash.hammingDistance(BigInt(a.simhash!), BigInt(b.simhash!));
176
+ const dist = SimHash.hammingDistance(simhashes[idx1], simhashes[idx2]);
114
177
  if (dist <= threshold) {
115
- // They are near duplicates.
116
- // Find or create their cluster set using union-find or reference propagation
117
- const setA = nearGroupMap.get(a.url);
118
- const setB = nearGroupMap.get(b.url);
119
-
120
- if (!setA && !setB) {
121
- const newSet = new Set<GraphNode>([a, b]);
122
- nearGroupMap.set(a.url, newSet);
123
- nearGroupMap.set(b.url, newSet);
124
- } else if (setA && !setB) {
125
- setA.add(b);
126
- nearGroupMap.set(b.url, setA);
127
- } else if (setB && !setA) {
128
- setB.add(a);
129
- nearGroupMap.set(a.url, setB);
130
- } else if (setA && setB && setA !== setB) {
131
- // Merge sets
132
- for (const node of setB) {
133
- setA.add(node);
134
- nearGroupMap.set(node.url, setA);
135
- }
136
- }
178
+ union(root1, root2);
179
+ involvedIndices.add(idx1);
180
+ involvedIndices.add(idx2);
137
181
  }
138
182
  }
139
183
  }
140
184
  }
141
185
  }
142
186
 
143
- // Compile near duplicate clusters (deduplicated by Set reference)
144
- const uniqueNearSets = new Set<Set<GraphNode>>();
145
- for (const group of nearGroupMap.values()) {
146
- uniqueNearSets.add(group);
187
+ return { parent, involvedIndices };
188
+ }
189
+
190
+ function extractClusters(parent: Uint32Array, involvedIndices: Set<number>, candidates: GraphNode[], startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
191
+ const nearClusters: DuplicateCluster[] = [];
192
+ let clusterCounter = startId;
193
+
194
+ function find(i: number): number {
195
+ let root = i;
196
+ while (parent[root] !== root) {
197
+ root = parent[root];
198
+ }
199
+ let curr = i;
200
+ while (curr !== root) {
201
+ const next = parent[curr];
202
+ parent[curr] = root;
203
+ curr = next;
204
+ }
205
+ return root;
147
206
  }
148
207
 
149
- for (const groupSet of uniqueNearSets) {
150
- if (groupSet.size > 1) {
208
+ // Compile clusters
209
+ const clusterMap = new Map<number, number[]>();
210
+ for (const idx of involvedIndices) {
211
+ const root = find(idx);
212
+ let group = clusterMap.get(root);
213
+ if (!group) {
214
+ group = [];
215
+ clusterMap.set(root, group);
216
+ }
217
+ group.push(idx);
218
+ }
219
+
220
+ for (const groupIndices of clusterMap.values()) {
221
+ if (groupIndices.length > 1) {
151
222
  const id = `cluster_near_${clusterCounter++}`;
152
- const groupArr = Array.from(groupSet);
153
- nearClusters.push({ id, type: 'near', nodes: groupArr });
154
- for (const n of groupArr) {
223
+ const groupNodes = groupIndices.map(idx => candidates[idx]);
224
+ nearClusters.push({ id, type: 'near', nodes: groupNodes });
225
+ for (const n of groupNodes) {
155
226
  n.duplicateClusterId = id;
156
227
  n.duplicateType = 'near';
157
228
  }
158
229
  }
159
230
  }
160
231
 
161
- const allClusters = [...exactClusters, ...nearClusters];
232
+ return { nearClusters, nextId: clusterCounter };
233
+ }
162
234
 
163
- // Phase 4: Template-Heavy Detection
164
- // Mark classes as 'template_heavy' if ratio < 0.3
165
- for (const cluster of allClusters) {
166
- const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
167
- if (avgRatio < 0.3) {
168
- cluster.type = 'template_heavy';
169
- cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
170
- }
235
+ function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) {
236
+ for (const cluster of clusters) {
237
+ processSingleCluster(cluster, graph, collapse);
171
238
  }
239
+ }
172
240
 
173
- // Phase 5: Canonical Conflict & Representative Selection
174
- for (const cluster of allClusters) {
175
- const canonicals = new Set<string>();
176
- let hasMissing = false;
241
+ function processSingleCluster(cluster: DuplicateCluster, graph: Graph, collapse: boolean) {
242
+ checkTemplateHeavy(cluster);
243
+ cluster.severity = calculateSeverity(cluster);
244
+ const representative = selectRepresentative(cluster);
245
+ cluster.representative = representative.url;
246
+ applyClusterToGraph(cluster, representative, graph, collapse);
247
+ }
177
248
 
178
- for (const n of cluster.nodes) {
179
- if (!n.canonical) hasMissing = true;
180
- // We compare full absolute canonical URLs (assuming they are normalized during crawl)
181
- else canonicals.add(n.canonical);
182
- }
249
+ function checkTemplateHeavy(cluster: DuplicateCluster) {
250
+ const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
251
+ if (avgRatio < 0.3) {
252
+ cluster.type = 'template_heavy';
253
+ cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
254
+ }
255
+ }
183
256
 
184
- if (hasMissing || canonicals.size > 1) {
185
- cluster.severity = 'high';
186
- } else if (cluster.type === 'near') {
187
- cluster.severity = 'medium';
188
- } else {
189
- cluster.severity = 'low';
190
- }
257
+ function calculateSeverity(cluster: DuplicateCluster): 'low' | 'medium' | 'high' {
258
+ const canonicals = new Set<string>();
259
+ let hasMissing = false;
260
+
261
+ for (const n of cluster.nodes) {
262
+ if (!n.canonical) hasMissing = true;
263
+ else canonicals.add(n.canonical);
264
+ }
191
265
 
192
- // Phase 6: Select Representative
193
- // 1. Valid Canonical target in cluster
194
- // 2. Highest internal in-degree
195
- // 3. Shortest URL
196
- // 4. First discovered (relying on array order, which is from BFS map roughly)
197
- let representativeNode = cluster.nodes[0];
266
+ if (hasMissing || canonicals.size > 1) {
267
+ return 'high';
268
+ } else if (cluster.type === 'near') {
269
+ return 'medium';
270
+ } else {
271
+ return 'low';
272
+ }
273
+ }
198
274
 
199
- // Evaluate best rep
200
- const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
201
- const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
275
+ function selectRepresentative(cluster: DuplicateCluster): GraphNode {
276
+ const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
277
+ const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
202
278
 
203
- if (validCanonicals.length > 0) {
204
- representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
205
- } else {
206
- representativeNode = cluster.nodes.reduce((best, current) => {
207
- if (current.inLinks > best.inLinks) return current;
208
- if (current.inLinks < best.inLinks) return best;
209
- if (current.url.length < best.url.length) return current;
210
- return best;
211
- });
212
- }
279
+ if (validCanonicals.length > 0) {
280
+ return validCanonicals[0];
281
+ }
213
282
 
214
- cluster.representative = representativeNode.url;
215
-
216
- cluster.nodes.forEach(n => {
217
- n.isClusterPrimary = n.url === representativeNode.url;
218
- n.isCollapsed = false; // default for JSON
219
- n.collapseInto = undefined;
220
- });
221
-
222
- // Push to Graph's final cluster list
223
- graph.duplicateClusters.push({
224
- id: cluster.id,
225
- type: cluster.type,
226
- size: cluster.nodes.length,
227
- representative: representativeNode.url,
228
- severity: cluster.severity!
229
- });
230
-
231
- // Controlled Collapse
232
- if (collapse) {
233
- for (const n of cluster.nodes) {
234
- if (n.url !== representativeNode.url) {
235
- n.isCollapsed = true;
236
- n.collapseInto = representativeNode.url;
237
- }
283
+ return cluster.nodes.reduce((best, current) => {
284
+ if (current.inLinks > best.inLinks) return current;
285
+ if (current.inLinks < best.inLinks) return best;
286
+ if (current.url.length < best.url.length) return current;
287
+ return best;
288
+ });
289
+ }
290
+
291
+ function applyClusterToGraph(cluster: DuplicateCluster, representative: GraphNode, graph: Graph, collapse: boolean) {
292
+ cluster.nodes.forEach(n => {
293
+ n.isClusterPrimary = n.url === representative.url;
294
+ n.isCollapsed = false;
295
+ n.collapseInto = undefined;
296
+ });
297
+
298
+ graph.duplicateClusters.push({
299
+ id: cluster.id,
300
+ type: cluster.type,
301
+ size: cluster.nodes.length,
302
+ representative: representative.url,
303
+ severity: cluster.severity!
304
+ });
305
+
306
+ if (collapse) {
307
+ for (const n of cluster.nodes) {
308
+ if (n.url !== representative.url) {
309
+ n.isCollapsed = true;
310
+ n.collapseInto = representative.url;
238
311
  }
239
312
  }
240
313
  }
314
+ }
241
315
 
242
- // Final Edge Transfer if Collapsing
243
- if (collapse) {
244
- const edges = graph.getEdges();
245
- const updatedEdges = new Map<string, number>();
246
-
247
- for (const edge of edges) {
248
- const sourceNode = graph.nodes.get(edge.source);
249
- const targetNode = graph.nodes.get(edge.target);
316
+ function collapseEdges(graph: Graph) {
317
+ const edges = graph.getEdges();
318
+ const updatedEdges = new Map<string, number>();
250
319
 
251
- if (!sourceNode || !targetNode) continue;
320
+ for (const edge of edges) {
321
+ const sourceNode = graph.nodes.get(edge.source);
322
+ const targetNode = graph.nodes.get(edge.target);
252
323
 
253
- // We do NOT modify source structure for out-bound edges of collapsed nodes?
254
- // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
255
- // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
256
- // If a collapsed node links to X, we ignore it (PageRank will filter it out).
324
+ if (!sourceNode || !targetNode) continue;
257
325
 
258
- const actualSource = edge.source;
259
- // repoint target
260
- const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
326
+ const actualSource = edge.source;
327
+ const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
261
328
 
262
- // Skip self-referential edges caused by repointing
263
- if (actualSource === actualTarget) continue;
329
+ if (actualSource === actualTarget) continue;
264
330
 
265
- const edgeKey = `${actualSource}|${actualTarget}`;
266
- const existingWeight = updatedEdges.get(edgeKey) || 0;
267
- updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
268
- }
331
+ const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
332
+ const existingWeight = updatedEdges.get(edgeKey) || 0;
333
+ updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
334
+ }
269
335
 
270
- // Update graph edges in-place
271
- graph.edges = updatedEdges;
336
+ graph.edges = updatedEdges;
272
337
 
273
- // Re-calculate inLinks and outLinks based on collapsed edges
274
- for (const node of graph.getNodes()) {
275
- node.inLinks = 0;
276
- node.outLinks = 0;
277
- }
278
- for (const [edgeKey, _weight] of updatedEdges.entries()) {
279
- const [src, tgt] = edgeKey.split('|');
280
- const sn = graph.nodes.get(src);
281
- const tn = graph.nodes.get(tgt);
282
- if (sn) sn.outLinks++;
283
- if (tn) tn.inLinks++;
284
- }
338
+ // Re-calculate inLinks and outLinks based on collapsed edges
339
+ for (const node of graph.getNodes()) {
340
+ node.inLinks = 0;
341
+ node.outLinks = 0;
342
+ }
343
+ for (const [edgeKey, _weight] of updatedEdges.entries()) {
344
+ const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
345
+ const sn = graph.nodes.get(src);
346
+ const tn = graph.nodes.get(tgt);
347
+ if (sn) sn.outLinks++;
348
+ if (tn) tn.inLinks++;
285
349
  }
286
350
  }