npm - @crawlith/core - Versions diffs - 0.1.0 → 0.1.1 - Mend

@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)

package/CHANGELOG.md +6 -0
package/dist/analysis/analysis_list.html +35 -0
package/dist/analysis/analysis_page.html +123 -0
package/dist/analysis/analyze.d.ts +17 -3
package/dist/analysis/analyze.js +192 -248
package/dist/analysis/scoring.js +7 -1
package/dist/analysis/templates.d.ts +2 -0
package/dist/analysis/templates.js +7 -0
package/dist/core/security/ipGuard.d.ts +11 -0
package/dist/core/security/ipGuard.js +71 -3
package/dist/crawler/crawl.d.ts +4 -22
package/dist/crawler/crawl.js +4 -335
package/dist/crawler/crawler.d.ts +75 -0
package/dist/crawler/crawler.js +518 -0
package/dist/crawler/extract.d.ts +4 -1
package/dist/crawler/extract.js +7 -2
package/dist/crawler/fetcher.d.ts +1 -0
package/dist/crawler/fetcher.js +20 -5
package/dist/crawler/metricsRunner.d.ts +3 -1
package/dist/crawler/metricsRunner.js +55 -46
package/dist/crawler/sitemap.d.ts +3 -0
package/dist/crawler/sitemap.js +5 -1
package/dist/db/graphLoader.js +32 -3
package/dist/db/index.d.ts +3 -0
package/dist/db/index.js +4 -0
package/dist/db/repositories/EdgeRepository.d.ts +8 -0
package/dist/db/repositories/EdgeRepository.js +13 -0
package/dist/db/repositories/MetricsRepository.d.ts +3 -0
package/dist/db/repositories/MetricsRepository.js +14 -1
package/dist/db/repositories/PageRepository.d.ts +11 -0
package/dist/db/repositories/PageRepository.js +112 -19
package/dist/db/repositories/SiteRepository.d.ts +3 -0
package/dist/db/repositories/SiteRepository.js +9 -0
package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
package/dist/db/repositories/SnapshotRepository.js +23 -2
package/dist/events.d.ts +48 -0
package/dist/events.js +1 -0
package/dist/graph/cluster.js +62 -14
package/dist/graph/duplicate.js +242 -191
package/dist/graph/graph.d.ts +16 -0
package/dist/graph/graph.js +17 -4
package/dist/graph/metrics.js +12 -0
package/dist/graph/pagerank.js +2 -0
package/dist/graph/simhash.d.ts +6 -0
package/dist/graph/simhash.js +14 -0
package/dist/index.d.ts +5 -2
package/dist/index.js +5 -2
package/dist/lock/hashKey.js +1 -1
package/dist/lock/lockManager.d.ts +4 -1
package/dist/lock/lockManager.js +23 -13
package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
package/dist/report/crawlExport.d.ts +3 -0
package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
package/dist/report/crawl_template.d.ts +1 -0
package/dist/report/crawl_template.js +7 -0
package/dist/report/html.js +15 -216
package/dist/scoring/health.d.ts +50 -0
package/dist/scoring/health.js +170 -0
package/dist/scoring/hits.d.ts +1 -0
package/dist/scoring/hits.js +64 -44
package/dist/scoring/orphanSeverity.d.ts +5 -5
package/package.json +3 -3
package/scripts/copy-assets.js +37 -0
package/src/analysis/analysis_list.html +35 -0
package/src/analysis/analysis_page.html +123 -0
package/src/analysis/analyze.ts +218 -261
package/src/analysis/scoring.ts +8 -1
package/src/analysis/templates.ts +9 -0
package/src/core/security/ipGuard.ts +82 -3
package/src/crawler/crawl.ts +6 -379
package/src/crawler/crawler.ts +601 -0
package/src/crawler/extract.ts +7 -2
package/src/crawler/fetcher.ts +24 -6
package/src/crawler/metricsRunner.ts +60 -47
package/src/crawler/sitemap.ts +4 -1
package/src/db/graphLoader.ts +33 -3
package/src/db/index.ts +5 -0
package/src/db/repositories/EdgeRepository.ts +14 -0
package/src/db/repositories/MetricsRepository.ts +15 -1
package/src/db/repositories/PageRepository.ts +119 -19
package/src/db/repositories/SiteRepository.ts +11 -0
package/src/db/repositories/SnapshotRepository.ts +28 -3
package/src/events.ts +16 -0
package/src/graph/cluster.ts +69 -15
package/src/graph/duplicate.ts +249 -185
package/src/graph/graph.ts +24 -4
package/src/graph/metrics.ts +15 -0
package/src/graph/pagerank.ts +1 -0
package/src/graph/simhash.ts +15 -0
package/src/index.ts +5 -2
package/src/lock/hashKey.ts +1 -1
package/src/lock/lockManager.ts +21 -13
package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
package/src/report/crawl_template.ts +9 -0
package/src/report/html.ts +17 -217
package/src/scoring/health.ts +241 -0
package/src/scoring/hits.ts +67 -45
package/src/scoring/orphanSeverity.ts +8 -8
package/tests/analysis.unit.test.ts +44 -0
package/tests/analyze.integration.test.ts +88 -53
package/tests/analyze_markdown.test.ts +98 -0
package/tests/audit/audit.test.ts +101 -0
package/tests/audit/scoring.test.ts +25 -25
package/tests/audit/transport.test.ts +0 -1
package/tests/clustering_risk.test.ts +118 -0
package/tests/crawler.test.ts +19 -13
package/tests/db/index.test.ts +134 -0
package/tests/db/repositories.test.ts +115 -0
package/tests/db_repos.test.ts +72 -0
package/tests/duplicate.test.ts +2 -2
package/tests/extract.test.ts +86 -0
package/tests/fetcher.test.ts +5 -1
package/tests/fetcher_safety.test.ts +9 -3
package/tests/graph/graph.test.ts +100 -0
package/tests/graphLoader.test.ts +124 -0
package/tests/html_report.test.ts +52 -51
package/tests/ipGuard.test.ts +73 -0
package/tests/lock/lockManager.test.ts +77 -17
package/tests/normalize.test.ts +6 -19
package/tests/orphanSeverity.test.ts +9 -9
package/tests/redirect_safety.test.ts +5 -1
package/tests/renderAnalysisCsv.test.ts +183 -0
package/tests/safety.test.ts +12 -0
package/tests/scope.test.ts +18 -0
package/tests/scoring.test.ts +25 -24
package/tests/sitemap.test.ts +13 -1
package/tests/ssrf_fix.test.ts +69 -0
package/tests/visualization_data.test.ts +10 -10
package/dist/report/sitegraphExport.d.ts +0 -3
package/dist/report/sitegraph_template.d.ts +0 -1

package/src/graph/cluster.ts CHANGED

@@ -1,5 +1,6 @@
 import { Graph, GraphNode, ClusterInfo } from './graph.js';
 import { SimHash } from './simhash.js';
+import { load } from 'cheerio';
 /**
  * Detects content clusters using 64-bit SimHash and Hamming Distance.
@@ -18,24 +19,23 @@ export function detectContentClusters(
     // Banding Optimization (4 bands of 16 bits)
     // Note: For threshold > 3, this is a heuristic and may miss some pairs,
     // but it dramatically reduces the search space as requested.
-    const bands = 4;
-    const bandWidth = 16;
-    const buckets: Map<number, Set<string>>[] = Array.from({ length: bands }, () => new Map());
+    const buckets: Map<number, Set<string>>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
     for (const node of nodes) {
         const hash = BigInt(node.simhash!);
-        for (let b = 0; b < bands; b++) {
-            const bandValue = Number((hash >> BigInt(b * bandWidth)) & 0xFFFFn);
+        const bandValues = SimHash.getBands(hash);
+        bandValues.forEach((bandValue, b) => {
             if (!buckets[b].has(bandValue)) {
                 buckets[b].set(bandValue, new Set());
             }
             buckets[b].get(bandValue)!.add(node.url);
-        }
+        });
     }
     const checkedPairs = new Set<string>();
-    for (let b = 0; b < bands; b++) {
+    for (let b = 0; b < SimHash.BANDS; b++) {
         for (const bucket of buckets[b].values()) {
             if (bucket.size < 2) continue;
             const bucketNodes = Array.from(bucket);
@@ -154,14 +154,68 @@ function selectPrimaryUrl(urls: string[], graph: Graph): string {
  * Calculates cannibalization risk based on title and H1 similarity within the cluster.
  */
 function calculateClusterRisk(nodes: GraphNode[]): 'low' | 'medium' | 'high' {
-    // Logic: Check if there's significant overlap in Titles or H1s among cluster members.
-    // This is a heuristic as requested.
-    // Simplified heuristic: risk is based on cluster density and size
-    // Large clusters of highly similar content are high risk.
-    // Fallback to a safe categorization
-    if (nodes.length > 5) return 'high';
-    if (nodes.length > 2) return 'medium';
+    if (nodes.length <= 1) return 'low';
+    // Count title and H1 occurrences
+    const titleCounts = new Map<string, number>();
+    const h1Counts = new Map<string, number>();
+    let processedCount = 0;
+    for (const node of nodes) {
+        if (!node.html) continue;
+        try {
+            const $ = load(node.html);
+            const title = $('title').text().trim().toLowerCase();
+            const h1 = $('h1').first().text().trim().toLowerCase();
+            if (title) {
+                titleCounts.set(title, (titleCounts.get(title) || 0) + 1);
+            }
+            if (h1) {
+                h1Counts.set(h1, (h1Counts.get(h1) || 0) + 1);
+            }
+            processedCount++;
+        } catch {
+            // Ignore parsing errors
+        }
+    }
+    // If we couldn't parse enough content (e.g., no HTML stored), fallback to size-based heuristic
+    if (processedCount < nodes.length * 0.5) {
+        if (nodes.length > 5) return 'high';
+        if (nodes.length > 2) return 'medium';
+        return 'low';
+    }
+    // Calculate duplicate ratios
+    let duplicateTitleCount = 0;
+    let duplicateH1Count = 0;
+    for (const count of titleCounts.values()) {
+        if (count > 1) duplicateTitleCount += count;
+    }
+    for (const count of h1Counts.values()) {
+        if (count > 1) duplicateH1Count += count;
+    }
+    const titleDupeRatio = duplicateTitleCount / nodes.length;
+    const h1DupeRatio = duplicateH1Count / nodes.length;
+    // Heuristic 1: High Risk
+    // Significant overlap in Titles OR H1s (e.g., > 30% of cluster members are duplicates)
+    if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3) {
+        return 'high';
+    }
+    // Heuristic 2: Medium Risk
+    // Any overlap, or very large clusters (potential template issues or thin content)
+    if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10) {
+        return 'medium';
+    }
+    // Heuristic 3: Low Risk
+    // Unique content and manageable cluster size
     return 'low';
 }

package/src/graph/duplicate.ts CHANGED

@@ -21,18 +21,36 @@ interface DuplicateCluster {
 export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
     const collapse = options.collapse !== false; // Default to true
     const threshold = options.simhashThreshold ?? 3;
-    const exactClusters: DuplicateCluster[] = [];
-    const nearClusters: DuplicateCluster[] = [];
     const nodes = graph.getNodes();
+    let clusterCounter = 1;
     // Phase 1 & 2: Exact Duplicate Detection
+    const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter);
+    clusterCounter = nextId1;
+    // Phase 3: Near Duplicate Detection
+    const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter);
+    const allClusters = [...exactClusters, ...nearClusters];
+    // Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
+    processClusters(allClusters, graph, collapse);
+    // Final Edge Transfer if Collapsing
+    if (collapse) {
+        collapseEdges(graph);
+    }
+}
+function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
+    const exactMap = groupNodesByContentHash(nodes);
+    return createExactClusters(exactMap, startId);
+}
+function groupNodesByContentHash(nodes: GraphNode[]): Map<string, GraphNode[]> {
     const exactMap = new Map<string, GraphNode[]>();
     for (const node of nodes) {
         if (!node.contentHash || node.status !== 200) continue;
-        // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
         let arr = exactMap.get(node.contentHash);
         if (!arr) {
             arr = [];
@@ -40,16 +58,18 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
         }
         arr.push(node);
     }
+    return exactMap;
+}
-    // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
+function createExactClusters(exactMap: Map<string, GraphNode[]>, startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
+    const exactClusters: DuplicateCluster[] = [];
     const nearCandidates: GraphNode[] = [];
-    let clusterCounter = 1;
+    let clusterCounter = startId;
     for (const [_hash, group] of exactMap.entries()) {
         if (group.length > 1) {
             const id = `cluster_exact_${clusterCounter++}`;
             exactClusters.push({ id, type: 'exact', nodes: group });
-            // Mark nodes
             for (const n of group) {
                 n.duplicateClusterId = id;
                 n.duplicateType = 'exact';
@@ -59,228 +79,272 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
         }
     }
-    // Phase 3: Near Duplicate Detection (SimHash with Bands)
-    // 64-bit simhash -> split into 4 bands of 16 bits.
-    const bandsMaps = [
-        new Map<number, GraphNode[]>(),
-        new Map<number, GraphNode[]>(),
-        new Map<number, GraphNode[]>(),
-        new Map<number, GraphNode[]>()
-    ];
-    for (const node of nearCandidates) {
-        if (!node.simhash) continue;
-        const simhash = BigInt(node.simhash);
-        // Extract 16 bit bands
-        const b0 = Number(simhash & 0xFFFFn);
-        const b1 = Number((simhash >> 16n) & 0xFFFFn);
-        const b2 = Number((simhash >> 32n) & 0xFFFFn);
-        const b3 = Number((simhash >> 48n) & 0xFFFFn);
-        const bands = [b0, b1, b2, b3];
-        for (let i = 0; i < 4; i++) {
-            let arr = bandsMaps[i].get(bands[i]);
+    return { exactClusters, nearCandidates, nextId: clusterCounter };
+}
+function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
+    const { bandsMaps, simhashes } = buildSimHashBuckets(candidates);
+    const { parent, involvedIndices } = findConnectedComponents(bandsMaps, simhashes, candidates.length, threshold);
+    return extractClusters(parent, involvedIndices, candidates, startId);
+}
+function buildSimHashBuckets(candidates: GraphNode[]): { bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, validIndices: number[] } {
+    const n = candidates.length;
+    const simhashes = new BigUint64Array(n);
+    const validIndices: number[] = [];
+    for (let i = 0; i < n; i++) {
+        if (candidates[i].simhash) {
+            simhashes[i] = BigInt(candidates[i].simhash!);
+            validIndices.push(i);
+        }
+    }
+    const bandsMaps: Map<number, number[]>[] = Array.from({ length: SimHash.BANDS }, () => new Map());
+    for (const idx of validIndices) {
+        const bands = SimHash.getBands(simhashes[idx]);
+        for (let b = 0; b < SimHash.BANDS; b++) {
+            let arr = bandsMaps[b].get(bands[b]);
             if (!arr) {
                 arr = [];
-                bandsMaps[i].set(bands[i], arr);
+                bandsMaps[b].set(bands[b], arr);
+            }
+            arr.push(idx);
+        }
+    }
+    return { bandsMaps, simhashes, validIndices };
+}
+function findConnectedComponents(bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, n: number, threshold: number): { parent: Uint32Array, involvedIndices: Set<number> } {
+    // Union-Find Arrays (Integer-based)
+    const parent = new Uint32Array(n);
+    const rank = new Uint8Array(n);
+    for (let i = 0; i < n; i++) {
+        parent[i] = i;
+        rank[i] = 0;
+    }
+    function find(i: number): number {
+        let root = i;
+        while (parent[root] !== root) {
+            root = parent[root];
+        }
+        let curr = i;
+        while (curr !== root) {
+            const next = parent[curr];
+            parent[curr] = root;
+            curr = next;
+        }
+        return root;
+    }
+    function union(i: number, j: number) {
+        const rootI = find(i);
+        const rootJ = find(j);
+        if (rootI !== rootJ) {
+            const rankI = rank[rootI];
+            const rankJ = rank[rootJ];
+            if (rankI < rankJ) {
+                parent[rootI] = rootJ;
+            } else if (rankI > rankJ) {
+                parent[rootJ] = rootI;
+            } else {
+                parent[rootJ] = rootI;
+                rank[rootI]++;
             }
-            arr.push(node);
         }
     }
-    // Find candidate pairs
-    const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set
-    const checkedPairs = new Set<string>();
+    const involvedIndices = new Set<number>();
-    for (let i = 0; i < 4; i++) {
-        for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
-            if (bucketNodes.length < 2) continue; // nothing to compare
+    for (let b = 0; b < SimHash.BANDS; b++) {
+        for (const bucketIndices of bandsMaps[b].values()) {
+            if (bucketIndices.length < 2) continue;
-            // Compare all nodes in this bucket
-            for (let j = 0; j < bucketNodes.length; j++) {
-                for (let k = j + 1; k < bucketNodes.length; k++) {
-                    const n1 = bucketNodes[j];
-                    const n2 = bucketNodes[k];
+            for (let j = 0; j < bucketIndices.length; j++) {
+                for (let k = j + 1; k < bucketIndices.length; k++) {
+                    const idx1 = bucketIndices[j];
+                    const idx2 = bucketIndices[k];
-                    // Ensure n1 < n2 lexicographically to avoid duplicate pairs
-                    const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
-                    const pairKey = `${a.url}|${b.url}`;
+                    const root1 = find(idx1);
+                    const root2 = find(idx2);
-                    if (checkedPairs.has(pairKey)) continue;
-                    checkedPairs.add(pairKey);
+                    if (root1 === root2) continue; // Already connected, skip expensive distance check
-                    const dist = SimHash.hammingDistance(BigInt(a.simhash!), BigInt(b.simhash!));
+                    const dist = SimHash.hammingDistance(simhashes[idx1], simhashes[idx2]);
                     if (dist <= threshold) {
-                        // They are near duplicates.
-                        // Find or create their cluster set using union-find or reference propagation
-                        const setA = nearGroupMap.get(a.url);
-                        const setB = nearGroupMap.get(b.url);
-                        if (!setA && !setB) {
-                            const newSet = new Set<GraphNode>([a, b]);
-                            nearGroupMap.set(a.url, newSet);
-                            nearGroupMap.set(b.url, newSet);
-                        } else if (setA && !setB) {
-                            setA.add(b);
-                            nearGroupMap.set(b.url, setA);
-                        } else if (setB && !setA) {
-                            setB.add(a);
-                            nearGroupMap.set(a.url, setB);
-                        } else if (setA && setB && setA !== setB) {
-                            // Merge sets
-                            for (const node of setB) {
-                                setA.add(node);
-                                nearGroupMap.set(node.url, setA);
-                            }
-                        }
+                        union(root1, root2);
+                        involvedIndices.add(idx1);
+                        involvedIndices.add(idx2);
                     }
                 }
             }
         }
     }
-    // Compile near duplicate clusters (deduplicated by Set reference)
-    const uniqueNearSets = new Set<Set<GraphNode>>();
-    for (const group of nearGroupMap.values()) {
-        uniqueNearSets.add(group);
+    return { parent, involvedIndices };
+}
+function extractClusters(parent: Uint32Array, involvedIndices: Set<number>, candidates: GraphNode[], startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
+    const nearClusters: DuplicateCluster[] = [];
+    let clusterCounter = startId;
+    function find(i: number): number {
+        let root = i;
+        while (parent[root] !== root) {
+            root = parent[root];
+        }
+        let curr = i;
+        while (curr !== root) {
+            const next = parent[curr];
+            parent[curr] = root;
+            curr = next;
+        }
+        return root;
     }
-    for (const groupSet of uniqueNearSets) {
-        if (groupSet.size > 1) {
+    // Compile clusters
+    const clusterMap = new Map<number, number[]>();
+    for (const idx of involvedIndices) {
+        const root = find(idx);
+        let group = clusterMap.get(root);
+        if (!group) {
+            group = [];
+            clusterMap.set(root, group);
+        }
+        group.push(idx);
+    }
+    for (const groupIndices of clusterMap.values()) {
+        if (groupIndices.length > 1) {
             const id = `cluster_near_${clusterCounter++}`;
-            const groupArr = Array.from(groupSet);
-            nearClusters.push({ id, type: 'near', nodes: groupArr });
-            for (const n of groupArr) {
+            const groupNodes = groupIndices.map(idx => candidates[idx]);
+            nearClusters.push({ id, type: 'near', nodes: groupNodes });
+            for (const n of groupNodes) {
                 n.duplicateClusterId = id;
                 n.duplicateType = 'near';
             }
         }
     }
-    const allClusters = [...exactClusters, ...nearClusters];
+    return { nearClusters, nextId: clusterCounter };
+}
-    // Phase 4: Template-Heavy Detection
-    // Mark classes as 'template_heavy' if ratio < 0.3
-    for (const cluster of allClusters) {
-        const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
-        if (avgRatio < 0.3) {
-            cluster.type = 'template_heavy';
-            cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
-        }
+function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) {
+    for (const cluster of clusters) {
+        processSingleCluster(cluster, graph, collapse);
     }
+}
-    // Phase 5: Canonical Conflict & Representative Selection
-    for (const cluster of allClusters) {
-        const canonicals = new Set<string>();
-        let hasMissing = false;
+function processSingleCluster(cluster: DuplicateCluster, graph: Graph, collapse: boolean) {
+    checkTemplateHeavy(cluster);
+    cluster.severity = calculateSeverity(cluster);
+    const representative = selectRepresentative(cluster);
+    cluster.representative = representative.url;
+    applyClusterToGraph(cluster, representative, graph, collapse);
+}
-        for (const n of cluster.nodes) {
-            if (!n.canonical) hasMissing = true;
-            // We compare full absolute canonical URLs (assuming they are normalized during crawl)
-            else canonicals.add(n.canonical);
-        }
+function checkTemplateHeavy(cluster: DuplicateCluster) {
+    const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
+    if (avgRatio < 0.3) {
+        cluster.type = 'template_heavy';
+        cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
+    }
+}
-        if (hasMissing || canonicals.size > 1) {
-            cluster.severity = 'high';
-        } else if (cluster.type === 'near') {
-            cluster.severity = 'medium';
-        } else {
-            cluster.severity = 'low';
-        }
+function calculateSeverity(cluster: DuplicateCluster): 'low' | 'medium' | 'high' {
+    const canonicals = new Set<string>();
+    let hasMissing = false;
+    for (const n of cluster.nodes) {
+        if (!n.canonical) hasMissing = true;
+        else canonicals.add(n.canonical);
+    }
-        // Phase 6: Select Representative
-        // 1. Valid Canonical target in cluster
-        // 2. Highest internal in-degree
-        // 3. Shortest URL
-        // 4. First discovered (relying on array order, which is from BFS map roughly)
-        let representativeNode = cluster.nodes[0];
+    if (hasMissing || canonicals.size > 1) {
+        return 'high';
+    } else if (cluster.type === 'near') {
+        return 'medium';
+    } else {
+        return 'low';
+    }
+}
-        // Evaluate best rep
-        const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
-        const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
+function selectRepresentative(cluster: DuplicateCluster): GraphNode {
+    const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
+    const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
-        if (validCanonicals.length > 0) {
-            representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
-        } else {
-            representativeNode = cluster.nodes.reduce((best, current) => {
-                if (current.inLinks > best.inLinks) return current;
-                if (current.inLinks < best.inLinks) return best;
-                if (current.url.length < best.url.length) return current;
-                return best;
-            });
-        }
+    if (validCanonicals.length > 0) {
+        return validCanonicals[0];
+    }
-        cluster.representative = representativeNode.url;
-        cluster.nodes.forEach(n => {
-            n.isClusterPrimary = n.url === representativeNode.url;
-            n.isCollapsed = false; // default for JSON
-            n.collapseInto = undefined;
-        });
-        // Push to Graph's final cluster list
-        graph.duplicateClusters.push({
-            id: cluster.id,
-            type: cluster.type,
-            size: cluster.nodes.length,
-            representative: representativeNode.url,
-            severity: cluster.severity!
-        });
-        // Controlled Collapse
-        if (collapse) {
-            for (const n of cluster.nodes) {
-                if (n.url !== representativeNode.url) {
-                    n.isCollapsed = true;
-                    n.collapseInto = representativeNode.url;
-                }
+    return cluster.nodes.reduce((best, current) => {
+        if (current.inLinks > best.inLinks) return current;
+        if (current.inLinks < best.inLinks) return best;
+        if (current.url.length < best.url.length) return current;
+        return best;
+    });
+}
+function applyClusterToGraph(cluster: DuplicateCluster, representative: GraphNode, graph: Graph, collapse: boolean) {
+    cluster.nodes.forEach(n => {
+        n.isClusterPrimary = n.url === representative.url;
+        n.isCollapsed = false;
+        n.collapseInto = undefined;
+    });
+    graph.duplicateClusters.push({
+        id: cluster.id,
+        type: cluster.type,
+        size: cluster.nodes.length,
+        representative: representative.url,
+        severity: cluster.severity!
+    });
+    if (collapse) {
+        for (const n of cluster.nodes) {
+            if (n.url !== representative.url) {
+                n.isCollapsed = true;
+                n.collapseInto = representative.url;
             }
         }
     }
+}
-    // Final Edge Transfer if Collapsing
-    if (collapse) {
-        const edges = graph.getEdges();
-        const updatedEdges = new Map<string, number>();
-        for (const edge of edges) {
-            const sourceNode = graph.nodes.get(edge.source);
-            const targetNode = graph.nodes.get(edge.target);
+function collapseEdges(graph: Graph) {
+    const edges = graph.getEdges();
+    const updatedEdges = new Map<string, number>();
-            if (!sourceNode || !targetNode) continue;
+    for (const edge of edges) {
+        const sourceNode = graph.nodes.get(edge.source);
+        const targetNode = graph.nodes.get(edge.target);
-            // We do NOT modify source structure for out-bound edges of collapsed nodes?
-            // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
-            // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
-            // If a collapsed node links to X, we ignore it (PageRank will filter it out).
+        if (!sourceNode || !targetNode) continue;
-            const actualSource = edge.source;
-            // repoint target
-            const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
+        const actualSource = edge.source;
+        const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
-            // Skip self-referential edges caused by repointing
-            if (actualSource === actualTarget) continue;
+        if (actualSource === actualTarget) continue;
-            const edgeKey = `${actualSource}|${actualTarget}`;
-            const existingWeight = updatedEdges.get(edgeKey) || 0;
-            updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
-        }
+        const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
+        const existingWeight = updatedEdges.get(edgeKey) || 0;
+        updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
+    }
-        // Update graph edges in-place
-        graph.edges = updatedEdges;
+    graph.edges = updatedEdges;
-        // Re-calculate inLinks and outLinks based on collapsed edges
-        for (const node of graph.getNodes()) {
-            node.inLinks = 0;
-            node.outLinks = 0;
-        }
-        for (const [edgeKey, _weight] of updatedEdges.entries()) {
-            const [src, tgt] = edgeKey.split('|');
-            const sn = graph.nodes.get(src);
-            const tn = graph.nodes.get(tgt);
-            if (sn) sn.outLinks++;
-            if (tn) tn.inLinks++;
-        }
+    // Re-calculate inLinks and outLinks based on collapsed edges
+    for (const node of graph.getNodes()) {
+        node.inLinks = 0;
+        node.outLinks = 0;
+    }
+    for (const [edgeKey, _weight] of updatedEdges.entries()) {
+        const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
+        const sn = graph.nodes.get(src);
+        const tn = graph.nodes.get(tgt);
+        if (sn) sn.outLinks++;
+        if (tn) tn.inLinks++;
     }
 }