@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -0,0 +1,48 @@
1
+ export type CrawlEvent = {
2
+ type: 'crawl:start';
3
+ url: string;
4
+ } | {
5
+ type: 'crawl:success';
6
+ url: string;
7
+ status: number;
8
+ durationMs: number;
9
+ depth?: number;
10
+ } | {
11
+ type: 'crawl:error';
12
+ url: string;
13
+ error: string;
14
+ depth?: number;
15
+ } | {
16
+ type: 'crawl:limit-reached';
17
+ limit: number;
18
+ } | {
19
+ type: 'queue:enqueue';
20
+ url: string;
21
+ depth: number;
22
+ } | {
23
+ type: 'metrics:start';
24
+ phase: string;
25
+ } | {
26
+ type: 'metrics:complete';
27
+ durationMs: number;
28
+ } | {
29
+ type: 'debug';
30
+ message: string;
31
+ context?: unknown;
32
+ } | {
33
+ type: 'info';
34
+ message: string;
35
+ context?: unknown;
36
+ } | {
37
+ type: 'warn';
38
+ message: string;
39
+ context?: unknown;
40
+ } | {
41
+ type: 'error';
42
+ message: string;
43
+ error?: unknown;
44
+ context?: unknown;
45
+ };
46
+ export interface EngineContext {
47
+ emit: (event: CrawlEvent) => void;
48
+ }
package/dist/events.js ADDED
@@ -0,0 +1 @@
1
+ export {};
@@ -1,4 +1,5 @@
1
1
  import { SimHash } from './simhash.js';
2
+ import { load } from 'cheerio';
2
3
  /**
3
4
  * Detects content clusters using 64-bit SimHash and Hamming Distance.
4
5
  * Uses band optimization to reduce O(n^2) comparisons.
@@ -11,21 +12,19 @@ export function detectContentClusters(graph, threshold = 10, minSize = 3) {
11
12
  // Banding Optimization (4 bands of 16 bits)
12
13
  // Note: For threshold > 3, this is a heuristic and may miss some pairs,
13
14
  // but it dramatically reduces the search space as requested.
14
- const bands = 4;
15
- const bandWidth = 16;
16
- const buckets = Array.from({ length: bands }, () => new Map());
15
+ const buckets = Array.from({ length: SimHash.BANDS }, () => new Map());
17
16
  for (const node of nodes) {
18
17
  const hash = BigInt(node.simhash);
19
- for (let b = 0; b < bands; b++) {
20
- const bandValue = Number((hash >> BigInt(b * bandWidth)) & 0xffffn);
18
+ const bandValues = SimHash.getBands(hash);
19
+ bandValues.forEach((bandValue, b) => {
21
20
  if (!buckets[b].has(bandValue)) {
22
21
  buckets[b].set(bandValue, new Set());
23
22
  }
24
23
  buckets[b].get(bandValue).add(node.url);
25
- }
24
+ });
26
25
  }
27
26
  const checkedPairs = new Set();
28
- for (let b = 0; b < bands; b++) {
27
+ for (let b = 0; b < SimHash.BANDS; b++) {
29
28
  for (const bucket of buckets[b].values()) {
30
29
  if (bucket.size < 2)
31
30
  continue;
@@ -135,15 +134,64 @@ function selectPrimaryUrl(urls, graph) {
135
134
  * Calculates cannibalization risk based on title and H1 similarity within the cluster.
136
135
  */
137
136
  function calculateClusterRisk(nodes) {
138
- // Logic: Check if there's significant overlap in Titles or H1s among cluster members.
139
- // This is a heuristic as requested.
140
- // Simplified heuristic: risk is based on cluster density and size
141
- // Large clusters of highly similar content are high risk.
142
- // Fallback to a safe categorization
143
- if (nodes.length > 5)
137
+ if (nodes.length <= 1)
138
+ return 'low';
139
+ // Count title and H1 occurrences
140
+ const titleCounts = new Map();
141
+ const h1Counts = new Map();
142
+ let processedCount = 0;
143
+ for (const node of nodes) {
144
+ if (!node.html)
145
+ continue;
146
+ try {
147
+ const $ = load(node.html);
148
+ const title = $('title').text().trim().toLowerCase();
149
+ const h1 = $('h1').first().text().trim().toLowerCase();
150
+ if (title) {
151
+ titleCounts.set(title, (titleCounts.get(title) || 0) + 1);
152
+ }
153
+ if (h1) {
154
+ h1Counts.set(h1, (h1Counts.get(h1) || 0) + 1);
155
+ }
156
+ processedCount++;
157
+ }
158
+ catch {
159
+ // Ignore parsing errors
160
+ }
161
+ }
162
+ // If we couldn't parse enough content (e.g., no HTML stored), fallback to size-based heuristic
163
+ if (processedCount < nodes.length * 0.5) {
164
+ if (nodes.length > 5)
165
+ return 'high';
166
+ if (nodes.length > 2)
167
+ return 'medium';
168
+ return 'low';
169
+ }
170
+ // Calculate duplicate ratios
171
+ let duplicateTitleCount = 0;
172
+ let duplicateH1Count = 0;
173
+ for (const count of titleCounts.values()) {
174
+ if (count > 1)
175
+ duplicateTitleCount += count;
176
+ }
177
+ for (const count of h1Counts.values()) {
178
+ if (count > 1)
179
+ duplicateH1Count += count;
180
+ }
181
+ const titleDupeRatio = duplicateTitleCount / nodes.length;
182
+ const h1DupeRatio = duplicateH1Count / nodes.length;
183
+ // Heuristic 1: High Risk
184
+ // Significant overlap in Titles OR H1s (e.g., > 30% of cluster members are duplicates)
185
+ if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3) {
144
186
  return 'high';
145
- if (nodes.length > 2)
187
+ }
188
+ // Heuristic 2: Medium Risk
189
+ // Any overlap, or very large clusters (potential template issues or thin content)
190
+ if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10) {
146
191
  return 'medium';
192
+ }
193
+ // Heuristic 3: Low Risk
194
+ // Unique content and manageable cluster size
147
195
  return 'low';
148
196
  }
149
197
  /**
@@ -1,3 +1,4 @@
1
+ import { Graph } from './graph.js';
1
2
  import { SimHash } from './simhash.js';
2
3
  /**
3
4
  * Detects exact and near duplicates, identifies canonical conflicts,
@@ -6,15 +7,30 @@ import { SimHash } from './simhash.js';
6
7
  export function detectDuplicates(graph, options = {}) {
7
8
  const collapse = options.collapse !== false; // Default to true
8
9
  const threshold = options.simhashThreshold ?? 3;
9
- const exactClusters = [];
10
- const nearClusters = [];
11
10
  const nodes = graph.getNodes();
11
+ let clusterCounter = 1;
12
12
  // Phase 1 & 2: Exact Duplicate Detection
13
+ const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter);
14
+ clusterCounter = nextId1;
15
+ // Phase 3: Near Duplicate Detection
16
+ const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter);
17
+ const allClusters = [...exactClusters, ...nearClusters];
18
+ // Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
19
+ processClusters(allClusters, graph, collapse);
20
+ // Final Edge Transfer if Collapsing
21
+ if (collapse) {
22
+ collapseEdges(graph);
23
+ }
24
+ }
25
+ function findExactDuplicates(nodes, startId) {
26
+ const exactMap = groupNodesByContentHash(nodes);
27
+ return createExactClusters(exactMap, startId);
28
+ }
29
+ function groupNodesByContentHash(nodes) {
13
30
  const exactMap = new Map();
14
31
  for (const node of nodes) {
15
32
  if (!node.contentHash || node.status !== 200)
16
33
  continue;
17
- // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
18
34
  let arr = exactMap.get(node.contentHash);
19
35
  if (!arr) {
20
36
  arr = [];
@@ -22,14 +38,16 @@ export function detectDuplicates(graph, options = {}) {
22
38
  }
23
39
  arr.push(node);
24
40
  }
25
- // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
41
+ return exactMap;
42
+ }
43
+ function createExactClusters(exactMap, startId) {
44
+ const exactClusters = [];
26
45
  const nearCandidates = [];
27
- let clusterCounter = 1;
46
+ let clusterCounter = startId;
28
47
  for (const [_hash, group] of exactMap.entries()) {
29
48
  if (group.length > 1) {
30
49
  const id = `cluster_exact_${clusterCounter++}`;
31
50
  exactClusters.push({ id, type: 'exact', nodes: group });
32
- // Mark nodes
33
51
  for (const n of group) {
34
52
  n.duplicateClusterId = id;
35
53
  n.duplicateType = 'exact';
@@ -39,213 +57,246 @@ export function detectDuplicates(graph, options = {}) {
39
57
  nearCandidates.push(group[0]);
40
58
  }
41
59
  }
42
- // Phase 3: Near Duplicate Detection (SimHash with Bands)
43
- // 64-bit simhash -> split into 4 bands of 16 bits.
44
- const bandsMaps = [
45
- new Map(),
46
- new Map(),
47
- new Map(),
48
- new Map()
49
- ];
50
- for (const node of nearCandidates) {
51
- if (!node.simhash)
52
- continue;
53
- const simhash = BigInt(node.simhash);
54
- // Extract 16 bit bands
55
- const b0 = Number(simhash & 0xffffn);
56
- const b1 = Number((simhash >> 16n) & 0xffffn);
57
- const b2 = Number((simhash >> 32n) & 0xffffn);
58
- const b3 = Number((simhash >> 48n) & 0xffffn);
59
- const bands = [b0, b1, b2, b3];
60
- for (let i = 0; i < 4; i++) {
61
- let arr = bandsMaps[i].get(bands[i]);
60
+ return { exactClusters, nearCandidates, nextId: clusterCounter };
61
+ }
62
+ function findNearDuplicates(candidates, threshold, startId) {
63
+ const { bandsMaps, simhashes } = buildSimHashBuckets(candidates);
64
+ const { parent, involvedIndices } = findConnectedComponents(bandsMaps, simhashes, candidates.length, threshold);
65
+ return extractClusters(parent, involvedIndices, candidates, startId);
66
+ }
67
+ function buildSimHashBuckets(candidates) {
68
+ const n = candidates.length;
69
+ const simhashes = new BigUint64Array(n);
70
+ const validIndices = [];
71
+ for (let i = 0; i < n; i++) {
72
+ if (candidates[i].simhash) {
73
+ simhashes[i] = BigInt(candidates[i].simhash);
74
+ validIndices.push(i);
75
+ }
76
+ }
77
+ const bandsMaps = Array.from({ length: SimHash.BANDS }, () => new Map());
78
+ for (const idx of validIndices) {
79
+ const bands = SimHash.getBands(simhashes[idx]);
80
+ for (let b = 0; b < SimHash.BANDS; b++) {
81
+ let arr = bandsMaps[b].get(bands[b]);
62
82
  if (!arr) {
63
83
  arr = [];
64
- bandsMaps[i].set(bands[i], arr);
84
+ bandsMaps[b].set(bands[b], arr);
65
85
  }
66
- arr.push(node);
86
+ arr.push(idx);
67
87
  }
68
88
  }
69
- // Find candidate pairs
70
- const nearGroupMap = new Map(); // node.url -> cluster set
71
- const checkedPairs = new Set();
72
- for (let i = 0; i < 4; i++) {
73
- for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
74
- if (bucketNodes.length < 2)
75
- continue; // nothing to compare
76
- // Compare all nodes in this bucket
77
- for (let j = 0; j < bucketNodes.length; j++) {
78
- for (let k = j + 1; k < bucketNodes.length; k++) {
79
- const n1 = bucketNodes[j];
80
- const n2 = bucketNodes[k];
81
- // Ensure n1 < n2 lexicographically to avoid duplicate pairs
82
- const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
83
- const pairKey = `${a.url}|${b.url}`;
84
- if (checkedPairs.has(pairKey))
85
- continue;
86
- checkedPairs.add(pairKey);
87
- const dist = SimHash.hammingDistance(BigInt(a.simhash), BigInt(b.simhash));
89
+ return { bandsMaps, simhashes, validIndices };
90
+ }
91
+ function findConnectedComponents(bandsMaps, simhashes, n, threshold) {
92
+ // Union-Find Arrays (Integer-based)
93
+ const parent = new Uint32Array(n);
94
+ const rank = new Uint8Array(n);
95
+ for (let i = 0; i < n; i++) {
96
+ parent[i] = i;
97
+ rank[i] = 0;
98
+ }
99
+ function find(i) {
100
+ let root = i;
101
+ while (parent[root] !== root) {
102
+ root = parent[root];
103
+ }
104
+ let curr = i;
105
+ while (curr !== root) {
106
+ const next = parent[curr];
107
+ parent[curr] = root;
108
+ curr = next;
109
+ }
110
+ return root;
111
+ }
112
+ function union(i, j) {
113
+ const rootI = find(i);
114
+ const rootJ = find(j);
115
+ if (rootI !== rootJ) {
116
+ const rankI = rank[rootI];
117
+ const rankJ = rank[rootJ];
118
+ if (rankI < rankJ) {
119
+ parent[rootI] = rootJ;
120
+ }
121
+ else if (rankI > rankJ) {
122
+ parent[rootJ] = rootI;
123
+ }
124
+ else {
125
+ parent[rootJ] = rootI;
126
+ rank[rootI]++;
127
+ }
128
+ }
129
+ }
130
+ const involvedIndices = new Set();
131
+ for (let b = 0; b < SimHash.BANDS; b++) {
132
+ for (const bucketIndices of bandsMaps[b].values()) {
133
+ if (bucketIndices.length < 2)
134
+ continue;
135
+ for (let j = 0; j < bucketIndices.length; j++) {
136
+ for (let k = j + 1; k < bucketIndices.length; k++) {
137
+ const idx1 = bucketIndices[j];
138
+ const idx2 = bucketIndices[k];
139
+ const root1 = find(idx1);
140
+ const root2 = find(idx2);
141
+ if (root1 === root2)
142
+ continue; // Already connected, skip expensive distance check
143
+ const dist = SimHash.hammingDistance(simhashes[idx1], simhashes[idx2]);
88
144
  if (dist <= threshold) {
89
- // They are near duplicates.
90
- // Find or create their cluster set using union-find or reference propagation
91
- const setA = nearGroupMap.get(a.url);
92
- const setB = nearGroupMap.get(b.url);
93
- if (!setA && !setB) {
94
- const newSet = new Set([a, b]);
95
- nearGroupMap.set(a.url, newSet);
96
- nearGroupMap.set(b.url, newSet);
97
- }
98
- else if (setA && !setB) {
99
- setA.add(b);
100
- nearGroupMap.set(b.url, setA);
101
- }
102
- else if (setB && !setA) {
103
- setB.add(a);
104
- nearGroupMap.set(a.url, setB);
105
- }
106
- else if (setA && setB && setA !== setB) {
107
- // Merge sets
108
- for (const node of setB) {
109
- setA.add(node);
110
- nearGroupMap.set(node.url, setA);
111
- }
112
- }
145
+ union(root1, root2);
146
+ involvedIndices.add(idx1);
147
+ involvedIndices.add(idx2);
113
148
  }
114
149
  }
115
150
  }
116
151
  }
117
152
  }
118
- // Compile near duplicate clusters (deduplicated by Set reference)
119
- const uniqueNearSets = new Set();
120
- for (const group of nearGroupMap.values()) {
121
- uniqueNearSets.add(group);
153
+ return { parent, involvedIndices };
154
+ }
155
+ function extractClusters(parent, involvedIndices, candidates, startId) {
156
+ const nearClusters = [];
157
+ let clusterCounter = startId;
158
+ function find(i) {
159
+ let root = i;
160
+ while (parent[root] !== root) {
161
+ root = parent[root];
162
+ }
163
+ let curr = i;
164
+ while (curr !== root) {
165
+ const next = parent[curr];
166
+ parent[curr] = root;
167
+ curr = next;
168
+ }
169
+ return root;
170
+ }
171
+ // Compile clusters
172
+ const clusterMap = new Map();
173
+ for (const idx of involvedIndices) {
174
+ const root = find(idx);
175
+ let group = clusterMap.get(root);
176
+ if (!group) {
177
+ group = [];
178
+ clusterMap.set(root, group);
179
+ }
180
+ group.push(idx);
122
181
  }
123
- for (const groupSet of uniqueNearSets) {
124
- if (groupSet.size > 1) {
182
+ for (const groupIndices of clusterMap.values()) {
183
+ if (groupIndices.length > 1) {
125
184
  const id = `cluster_near_${clusterCounter++}`;
126
- const groupArr = Array.from(groupSet);
127
- nearClusters.push({ id, type: 'near', nodes: groupArr });
128
- for (const n of groupArr) {
185
+ const groupNodes = groupIndices.map(idx => candidates[idx]);
186
+ nearClusters.push({ id, type: 'near', nodes: groupNodes });
187
+ for (const n of groupNodes) {
129
188
  n.duplicateClusterId = id;
130
189
  n.duplicateType = 'near';
131
190
  }
132
191
  }
133
192
  }
134
- const allClusters = [...exactClusters, ...nearClusters];
135
- // Phase 4: Template-Heavy Detection
136
- // Mark classes as 'template_heavy' if ratio < 0.3
137
- for (const cluster of allClusters) {
138
- const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
139
- if (avgRatio < 0.3) {
140
- cluster.type = 'template_heavy';
141
- cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
142
- }
193
+ return { nearClusters, nextId: clusterCounter };
194
+ }
195
+ function processClusters(clusters, graph, collapse) {
196
+ for (const cluster of clusters) {
197
+ processSingleCluster(cluster, graph, collapse);
198
+ }
199
+ }
200
+ function processSingleCluster(cluster, graph, collapse) {
201
+ checkTemplateHeavy(cluster);
202
+ cluster.severity = calculateSeverity(cluster);
203
+ const representative = selectRepresentative(cluster);
204
+ cluster.representative = representative.url;
205
+ applyClusterToGraph(cluster, representative, graph, collapse);
206
+ }
207
+ function checkTemplateHeavy(cluster) {
208
+ const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
209
+ if (avgRatio < 0.3) {
210
+ cluster.type = 'template_heavy';
211
+ cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
212
+ }
213
+ }
214
+ function calculateSeverity(cluster) {
215
+ const canonicals = new Set();
216
+ let hasMissing = false;
217
+ for (const n of cluster.nodes) {
218
+ if (!n.canonical)
219
+ hasMissing = true;
220
+ else
221
+ canonicals.add(n.canonical);
222
+ }
223
+ if (hasMissing || canonicals.size > 1) {
224
+ return 'high';
225
+ }
226
+ else if (cluster.type === 'near') {
227
+ return 'medium';
143
228
  }
144
- // Phase 5: Canonical Conflict & Representative Selection
145
- for (const cluster of allClusters) {
146
- const canonicals = new Set();
147
- let hasMissing = false;
229
+ else {
230
+ return 'low';
231
+ }
232
+ }
233
+ function selectRepresentative(cluster) {
234
+ const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
235
+ const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
236
+ if (validCanonicals.length > 0) {
237
+ return validCanonicals[0];
238
+ }
239
+ return cluster.nodes.reduce((best, current) => {
240
+ if (current.inLinks > best.inLinks)
241
+ return current;
242
+ if (current.inLinks < best.inLinks)
243
+ return best;
244
+ if (current.url.length < best.url.length)
245
+ return current;
246
+ return best;
247
+ });
248
+ }
249
+ function applyClusterToGraph(cluster, representative, graph, collapse) {
250
+ cluster.nodes.forEach(n => {
251
+ n.isClusterPrimary = n.url === representative.url;
252
+ n.isCollapsed = false;
253
+ n.collapseInto = undefined;
254
+ });
255
+ graph.duplicateClusters.push({
256
+ id: cluster.id,
257
+ type: cluster.type,
258
+ size: cluster.nodes.length,
259
+ representative: representative.url,
260
+ severity: cluster.severity
261
+ });
262
+ if (collapse) {
148
263
  for (const n of cluster.nodes) {
149
- if (!n.canonical)
150
- hasMissing = true;
151
- // We compare full absolute canonical URLs (assuming they are normalized during crawl)
152
- else
153
- canonicals.add(n.canonical);
154
- }
155
- if (hasMissing || canonicals.size > 1) {
156
- cluster.severity = 'high';
157
- }
158
- else if (cluster.type === 'near') {
159
- cluster.severity = 'medium';
160
- }
161
- else {
162
- cluster.severity = 'low';
163
- }
164
- // Phase 6: Select Representative
165
- // 1. Valid Canonical target in cluster
166
- // 2. Highest internal in-degree
167
- // 3. Shortest URL
168
- // 4. First discovered (relying on array order, which is from BFS map roughly)
169
- let representativeNode = cluster.nodes[0];
170
- // Evaluate best rep
171
- const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
172
- const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
173
- if (validCanonicals.length > 0) {
174
- representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
175
- }
176
- else {
177
- representativeNode = cluster.nodes.reduce((best, current) => {
178
- if (current.inLinks > best.inLinks)
179
- return current;
180
- if (current.inLinks < best.inLinks)
181
- return best;
182
- if (current.url.length < best.url.length)
183
- return current;
184
- return best;
185
- });
186
- }
187
- cluster.representative = representativeNode.url;
188
- cluster.nodes.forEach(n => {
189
- n.isClusterPrimary = n.url === representativeNode.url;
190
- n.isCollapsed = false; // default for JSON
191
- n.collapseInto = undefined;
192
- });
193
- // Push to Graph's final cluster list
194
- graph.duplicateClusters.push({
195
- id: cluster.id,
196
- type: cluster.type,
197
- size: cluster.nodes.length,
198
- representative: representativeNode.url,
199
- severity: cluster.severity
200
- });
201
- // Controlled Collapse
202
- if (collapse) {
203
- for (const n of cluster.nodes) {
204
- if (n.url !== representativeNode.url) {
205
- n.isCollapsed = true;
206
- n.collapseInto = representativeNode.url;
207
- }
264
+ if (n.url !== representative.url) {
265
+ n.isCollapsed = true;
266
+ n.collapseInto = representative.url;
208
267
  }
209
268
  }
210
269
  }
211
- // Final Edge Transfer if Collapsing
212
- if (collapse) {
213
- const edges = graph.getEdges();
214
- const updatedEdges = new Map();
215
- for (const edge of edges) {
216
- const sourceNode = graph.nodes.get(edge.source);
217
- const targetNode = graph.nodes.get(edge.target);
218
- if (!sourceNode || !targetNode)
219
- continue;
220
- // We do NOT modify source structure for out-bound edges of collapsed nodes?
221
- // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
222
- // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
223
- // If a collapsed node links to X, we ignore it (PageRank will filter it out).
224
- const actualSource = edge.source;
225
- // repoint target
226
- const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
227
- // Skip self-referential edges caused by repointing
228
- if (actualSource === actualTarget)
229
- continue;
230
- const edgeKey = `${actualSource}|${actualTarget}`;
231
- const existingWeight = updatedEdges.get(edgeKey) || 0;
232
- updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
233
- }
234
- // Update graph edges in-place
235
- graph.edges = updatedEdges;
236
- // Re-calculate inLinks and outLinks based on collapsed edges
237
- for (const node of graph.getNodes()) {
238
- node.inLinks = 0;
239
- node.outLinks = 0;
240
- }
241
- for (const [edgeKey, _weight] of updatedEdges.entries()) {
242
- const [src, tgt] = edgeKey.split('|');
243
- const sn = graph.nodes.get(src);
244
- const tn = graph.nodes.get(tgt);
245
- if (sn)
246
- sn.outLinks++;
247
- if (tn)
248
- tn.inLinks++;
249
- }
270
+ }
271
+ function collapseEdges(graph) {
272
+ const edges = graph.getEdges();
273
+ const updatedEdges = new Map();
274
+ for (const edge of edges) {
275
+ const sourceNode = graph.nodes.get(edge.source);
276
+ const targetNode = graph.nodes.get(edge.target);
277
+ if (!sourceNode || !targetNode)
278
+ continue;
279
+ const actualSource = edge.source;
280
+ const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
281
+ if (actualSource === actualTarget)
282
+ continue;
283
+ const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
284
+ const existingWeight = updatedEdges.get(edgeKey) || 0;
285
+ updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
286
+ }
287
+ graph.edges = updatedEdges;
288
+ // Re-calculate inLinks and outLinks based on collapsed edges
289
+ for (const node of graph.getNodes()) {
290
+ node.inLinks = 0;
291
+ node.outLinks = 0;
292
+ }
293
+ for (const [edgeKey, _weight] of updatedEdges.entries()) {
294
+ const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
295
+ const sn = graph.nodes.get(src);
296
+ const tn = graph.nodes.get(tgt);
297
+ if (sn)
298
+ sn.outLinks++;
299
+ if (tn)
300
+ tn.inLinks++;
250
301
  }
251
302
  }