@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/graph/cluster.ts
DELETED
|
@@ -1,246 +0,0 @@
|
|
|
1
|
-
import { Graph, GraphNode, ClusterInfo } from './graph.js';
|
|
2
|
-
import { SimHash } from './simhash.js';
|
|
3
|
-
import { load } from 'cheerio';
|
|
4
|
-
|
|
5
|
-
/**
 * Detects content clusters using 64-bit SimHash fingerprints and Hamming distance.
 *
 * Only nodes with a truthy `simhash` and `status === 200` take part. Candidate pairs are
 * produced by banding (two pages are compared only when they share at least one SimHash
 * band value), pairs within `threshold` are linked, and each connected component of at
 * least `minSize` pages becomes a cluster.
 *
 * Side effects: sets `clusterId` on every clustered node and assigns `graph.contentClusters`
 * (the early return for "no eligible nodes" leaves both untouched).
 *
 * @param graph - Graph whose nodes are clustered.
 * @param threshold - Maximum Hamming distance (inclusive) for two pages to be linked.
 * @param minSize - Minimum number of pages a component needs to be reported.
 * @returns Cluster summaries ordered by size (descending), then by primary URL (ascending);
 *          cluster IDs are the 1-based position in that order.
 */
export function detectContentClusters(
  graph: Graph,
  threshold: number = 10,
  minSize: number = 3
): ClusterInfo[] {
  const nodes = graph.getNodes().filter(n => n.simhash && n.status === 200);
  if (nodes.length === 0) return [];

  // Banding optimization: one bucket map per SimHash band.
  // Note: for threshold > 3 this is a heuristic and may miss some pairs,
  // but it dramatically reduces the search space.
  const buckets: Map<number, Set<string>>[] = Array.from({ length: SimHash.BANDS }, () => new Map());

  for (const node of nodes) {
    const bandValues = SimHash.getBands(BigInt(node.simhash!));
    bandValues.forEach((bandValue, b) => {
      let bucket = buckets[b].get(bandValue);
      if (!bucket) {
        bucket = new Set();
        buckets[b].set(bandValue, bucket);
      }
      bucket.add(node.url);
    });
  }

  const adjacency = new Map<string, Set<string>>();
  const link = (from: string, to: string): void => {
    let neighbors = adjacency.get(from);
    if (!neighbors) {
      neighbors = new Set();
      adjacency.set(from, neighbors);
    }
    neighbors.add(to);
  };

  // A pair may share several bands; remember it so the distance is computed once.
  const checkedPairs = new Set<string>();

  for (const bandBuckets of buckets) {
    for (const bucket of bandBuckets.values()) {
      if (bucket.size < 2) continue;
      // Set members are unique, so no self-pair check is needed below.
      const bucketUrls = Array.from(bucket);
      for (let i = 0; i < bucketUrls.length; i++) {
        for (let j = i + 1; j < bucketUrls.length; j++) {
          const u1 = bucketUrls[i];
          const u2 = bucketUrls[j];

          const pairKey = u1 < u2 ? `${u1}|${u2}` : `${u2}|${u1}`;
          if (checkedPairs.has(pairKey)) continue;
          checkedPairs.add(pairKey);

          const n1 = graph.nodes.get(u1)!;
          const n2 = graph.nodes.get(u2)!;

          const dist = SimHash.hammingDistance(BigInt(n1.simhash!), BigInt(n2.simhash!));
          if (dist <= threshold) {
            link(u1, u2);
            link(u2, u1);
          }
        }
      }
    }
  }

  // Find connected components (clusters) with a breadth-first walk.
  // The component array doubles as the BFS queue: `head` advances instead of calling
  // Array.prototype.shift(), which is linear per call and made the walk quadratic.
  const visited = new Set<string>();
  const components: string[][] = [];

  for (const node of nodes) {
    if (visited.has(node.url)) continue;

    const component: string[] = [node.url];
    visited.add(node.url);

    for (let head = 0; head < component.length; head++) {
      const neighbors = adjacency.get(component[head]);
      if (!neighbors) continue;
      for (const neighbor of neighbors) {
        if (!visited.has(neighbor)) {
          visited.add(neighbor);
          component.push(neighbor);
        }
      }
    }

    if (component.length >= minSize) {
      components.push(component);
    }
  }

  // Resolve each cluster's primary URL once; it is needed both as the sort tie-breaker
  // and in the emitted summary (previously recomputed on every comparison).
  const ranked = components.map(memberUrls => ({
    memberUrls,
    primaryUrl: selectPrimaryUrl(memberUrls, graph)
  }));

  // Sort by size (descending) then by primary URL (ascending) for deterministic IDs.
  ranked.sort((a, b) => {
    if (b.memberUrls.length !== a.memberUrls.length) {
      return b.memberUrls.length - a.memberUrls.length;
    }
    return a.primaryUrl.localeCompare(b.primaryUrl);
  });

  const clusterInfos: ClusterInfo[] = ranked.map(({ memberUrls, primaryUrl }, index) => {
    const clusterId = index + 1;
    const clusterNodes = memberUrls.map(url => graph.nodes.get(url)!);

    for (const member of clusterNodes) {
      member.clusterId = clusterId;
    }

    return {
      id: clusterId,
      count: memberUrls.length,
      primaryUrl,
      risk: calculateClusterRisk(clusterNodes),
      sharedPathPrefix: findSharedPathPrefix(memberUrls)
    };
  });

  graph.contentClusters = clusterInfos;
  return clusterInfos;
}
|
|
131
|
-
|
|
132
|
-
/**
 * Picks the URL that represents a cluster.
 *
 * Preference order:
 * 1. Highest PageRank (a missing/falsy `pageRank` counts as 0)
 * 2. Shortest URL
 * 3. `localeCompare` order as the final tie-breaker
 *
 * Every URL must exist in `graph.nodes`; an empty `urls` array makes `reduce` throw.
 */
function selectPrimaryUrl(urls: string[], graph: Graph): string {
  const rankOf = (url: string): number => graph.nodes.get(url)!.pageRank || 0;

  return urls.reduce((leader, candidate) => {
    const leaderRank = rankOf(leader);
    const candidateRank = rankOf(candidate);

    if (candidateRank !== leaderRank) {
      return candidateRank > leaderRank ? candidate : leader;
    }

    if (candidate.length !== leader.length) {
      return candidate.length < leader.length ? candidate : leader;
    }

    return candidate.localeCompare(leader) < 0 ? candidate : leader;
  });
}
|
|
152
|
-
|
|
153
|
-
/**
 * Rates the cannibalization risk of a cluster from how many members share a
 * `<title>` or first `<h1>` (compared trimmed and lower-cased).
 *
 * - Clusters of at most one node are always `'low'`.
 * - When fewer than half of the nodes had parseable `html`, the rating falls back to
 *   cluster size alone (> 5 → high, > 2 → medium, otherwise low).
 * - Otherwise: more than 30% of members sharing a title or H1 → high; any sharing at all,
 *   or more than 10 members → medium; else low.
 */
function calculateClusterRisk(nodes: GraphNode[]): 'low' | 'medium' | 'high' {
  const total = nodes.length;
  if (total <= 1) return 'low';

  // Increment the tally for a non-empty key.
  const bump = (tally: Map<string, number>, key: string): void => {
    if (key) tally.set(key, (tally.get(key) || 0) + 1);
  };

  const titleTally = new Map<string, number>();
  const headingTally = new Map<string, number>();
  let parsedPages = 0;

  for (let i = 0; i < total; i++) {
    const page = nodes[i];
    if (!page.html) continue;

    try {
      const $ = load(page.html);
      const titleText = $('title').text().trim().toLowerCase();
      const headingText = $('h1').first().text().trim().toLowerCase();

      bump(titleTally, titleText);
      bump(headingTally, headingText);
      parsedPages++;
    } catch {
      // Ignore parsing errors
    }
  }

  // Not enough parsed content (e.g. no HTML stored): size-based heuristic only.
  if (parsedPages < total * 0.5) {
    return total > 5 ? 'high' : total > 2 ? 'medium' : 'low';
  }

  // Number of pages whose value occurs more than once in the tally.
  const sharedCount = (tally: Map<string, number>): number => {
    let shared = 0;
    tally.forEach(occurrences => {
      if (occurrences > 1) shared += occurrences;
    });
    return shared;
  };

  const titleShare = sharedCount(titleTally) / total;
  const headingShare = sharedCount(headingTally) / total;

  // High: significant overlap in titles OR H1s.
  if (titleShare > 0.3 || headingShare > 0.3) return 'high';

  // Medium: any overlap, or a very large cluster.
  if (titleShare > 0 || headingShare > 0 || total > 10) return 'medium';

  // Low: unique titles/H1s and a manageable cluster size.
  return 'low';
}
|
|
221
|
-
|
|
222
|
-
/**
 * Computes the longest run of leading path segments shared by every URL.
 *
 * @returns The shared prefix with a leading slash (e.g. `/blog/2020`), or `undefined`
 *          when fewer than two URLs are given, any URL fails to parse, or no leading
 *          segment is common to all of them.
 */
function findSharedPathPrefix(urls: string[]): string | undefined {
  if (urls.length < 2) return undefined;

  let segmentLists: string[][];
  try {
    segmentLists = urls.map(raw => new URL(raw).pathname.split('/').filter(Boolean));
  } catch {
    return undefined;
  }

  const reference = segmentLists[0];
  let sharedDepth = 0;
  while (
    sharedDepth < reference.length &&
    segmentLists.every(segments => segments[sharedDepth] === reference[sharedDepth])
  ) {
    sharedDepth++;
  }

  return sharedDepth > 0 ? '/' + reference.slice(0, sharedDepth).join('/') : undefined;
}
|
package/src/graph/duplicate.ts
DELETED
|
@@ -1,350 +0,0 @@
|
|
|
1
|
-
import { Graph, GraphNode } from './graph.js';
|
|
2
|
-
import { SimHash } from './simhash.js';
|
|
3
|
-
|
|
4
|
-
/** Options accepted by `detectDuplicates`. */
export interface DuplicateOptions {
  /** Collapse duplicates into their representative; enabled unless explicitly `false`. */
  collapse?: boolean;
  /** Maximum SimHash Hamming distance for a near-duplicate match (default: 3). */
  simhashThreshold?: number;
}
|
|
8
|
-
|
|
9
|
-
/** Working record for one group of duplicate pages while it is being processed. */
interface DuplicateCluster {
  /** Cluster identifier, e.g. `cluster_exact_1` or `cluster_near_2`. */
  id: string;
  /** How the members were matched; may be reclassified to `template_heavy` later. */
  type: 'exact' | 'near' | 'template_heavy';
  /** Member pages of the cluster. */
  nodes: GraphNode[];
  /** URL of the page chosen to represent the cluster, once selected. */
  representative?: string;
  /** Severity rating, once calculated. */
  severity?: 'low' | 'medium' | 'high';
}
|
|
16
|
-
|
|
17
|
-
/**
 * Runs the duplicate-detection pipeline over the graph:
 * exact duplicates first, then near duplicates among the remaining pages, then
 * per-cluster processing (template-heavy check, severity, representative), and finally
 * a non-destructive edge collapse when collapsing is enabled.
 *
 * @param graph - Graph to analyse; nodes, `duplicateClusters` and (when collapsing) edges are updated in place.
 * @param options - `collapse` defaults to enabled, `simhashThreshold` defaults to 3.
 */
export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
  const shouldCollapse = options.collapse !== false;
  const maxDistance = options.simhashThreshold ?? 3;

  // Exact pass numbers its clusters from 1; the near pass continues the same counter.
  const exactPass = findExactDuplicates(graph.getNodes(), 1);
  const nearPass = findNearDuplicates(exactPass.nearCandidates, maxDistance, exactPass.nextId);

  processClusters([...exactPass.exactClusters, ...nearPass.nearClusters], graph, shouldCollapse);

  if (shouldCollapse) {
    collapseEdges(graph);
  }
}
|
|
44
|
-
|
|
45
|
-
/**
 * Groups pages by content hash and turns every multi-page group into an exact-duplicate
 * cluster; pages with a unique hash are returned as near-duplicate candidates.
 */
function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
  return createExactClusters(groupNodesByContentHash(nodes), startId);
}
|
|
49
|
-
|
|
50
|
-
/**
 * Buckets pages by `contentHash`, keeping input order within each bucket.
 * Pages without a content hash or with a status other than 200 are left out.
 */
function groupNodesByContentHash(nodes: GraphNode[]): Map<string, GraphNode[]> {
  const groups = new Map<string, GraphNode[]>();

  for (let i = 0; i < nodes.length; i++) {
    const page = nodes[i];
    const key = page.contentHash;
    if (!key || page.status !== 200) continue;

    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(page);
    } else {
      groups.set(key, [page]);
    }
  }

  return groups;
}
|
|
63
|
-
|
|
64
|
-
/**
 * Converts content-hash groups into exact-duplicate clusters.
 *
 * Groups with more than one page become a cluster named `cluster_exact_<n>` and their
 * pages are tagged with that ID and `duplicateType = 'exact'`. The first page of every
 * other group is passed on as a near-duplicate candidate.
 *
 * @returns The clusters, the near-duplicate candidates, and the next unused counter value.
 */
function createExactClusters(exactMap: Map<string, GraphNode[]>, startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
  const exactClusters: DuplicateCluster[] = [];
  const nearCandidates: GraphNode[] = [];
  let nextId = startId;

  exactMap.forEach(members => {
    if (members.length <= 1) {
      nearCandidates.push(members[0]);
      return;
    }

    const id = `cluster_exact_${nextId++}`;
    exactClusters.push({ id, type: 'exact', nodes: members });
    for (let i = 0; i < members.length; i++) {
      members[i].duplicateClusterId = id;
      members[i].duplicateType = 'exact';
    }
  });

  return { exactClusters, nearCandidates, nextId };
}
|
|
84
|
-
|
|
85
|
-
/**
 * Near-duplicate pass: bucket candidates by SimHash band, union pages whose Hamming
 * distance is within `threshold`, then materialise the resulting groups as clusters.
 */
function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
  const bucketing = buildSimHashBuckets(candidates);
  const components = findConnectedComponents(
    bucketing.bandsMaps,
    bucketing.simhashes,
    candidates.length,
    threshold
  );
  return extractClusters(components.parent, components.involvedIndices, candidates, startId);
}
|
|
90
|
-
|
|
91
|
-
/**
 * Prepares the lookup structures for near-duplicate detection.
 *
 * @returns
 * - `simhashes`: fingerprint per candidate index (entries for candidates without a
 *   `simhash` stay at the array's initial 0),
 * - `validIndices`: indices of candidates that have a `simhash`,
 * - `bandsMaps`: for each band, a map from band value to the candidate indices sharing it.
 */
function buildSimHashBuckets(candidates: GraphNode[]): { bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, validIndices: number[] } {
  const simhashes = new BigUint64Array(candidates.length);
  const validIndices: number[] = [];

  for (const [position, candidate] of candidates.entries()) {
    const fingerprint = candidate.simhash;
    if (!fingerprint) continue;
    simhashes[position] = BigInt(fingerprint);
    validIndices.push(position);
  }

  const bandsMaps: Map<number, number[]>[] = Array.from({ length: SimHash.BANDS }, () => new Map());

  for (const position of validIndices) {
    const bands = SimHash.getBands(simhashes[position]);
    for (let b = 0; b < SimHash.BANDS; b++) {
      const bandValue = bands[b];
      const members = bandsMaps[b].get(bandValue);
      if (members) {
        members.push(position);
      } else {
        bandsMaps[b].set(bandValue, [position]);
      }
    }
  }

  return { bandsMaps, simhashes, validIndices };
}
|
|
119
|
-
|
|
120
|
-
/**
 * Unions candidate indices whose SimHashes are within `threshold` of each other.
 *
 * Only indices sharing a band bucket are compared, and pairs already in the same set are
 * skipped without computing the distance.
 *
 * @param bandsMaps - Per-band maps from band value to candidate indices.
 * @param simhashes - Fingerprint per candidate index.
 * @param n - Number of candidates (size of the union-find structure).
 * @param threshold - Maximum Hamming distance (inclusive) for a match.
 * @returns The union-find `parent` array and the set of indices that were part of at
 *          least one successful match.
 */
function findConnectedComponents(bandsMaps: Map<number, number[]>[], simhashes: BigUint64Array, n: number, threshold: number): { parent: Uint32Array, involvedIndices: Set<number> } {
  // Integer-based union-find; typed arrays start zeroed, so ranks need no explicit reset.
  const parent = new Uint32Array(n);
  const rank = new Uint8Array(n);
  for (let slot = 0; slot < n; slot++) {
    parent[slot] = slot;
  }

  // Root lookup with full path compression.
  const find = (start: number): number => {
    let root = start;
    while (parent[root] !== root) {
      root = parent[root];
    }
    for (let cursor = start; cursor !== root; ) {
      const up = parent[cursor];
      parent[cursor] = root;
      cursor = up;
    }
    return root;
  };

  // Union by rank: the shallower tree is attached beneath the deeper one.
  const union = (a: number, b: number): void => {
    const rootA = find(a);
    const rootB = find(b);
    if (rootA === rootB) return;

    if (rank[rootA] < rank[rootB]) {
      parent[rootA] = rootB;
      return;
    }
    if (rank[rootA] === rank[rootB]) {
      rank[rootA]++;
    }
    parent[rootB] = rootA;
  };

  const involvedIndices = new Set<number>();

  for (let b = 0; b < SimHash.BANDS; b++) {
    for (const members of bandsMaps[b].values()) {
      if (members.length < 2) continue;

      for (let first = 0; first < members.length; first++) {
        for (let second = first + 1; second < members.length; second++) {
          const left = members[first];
          const right = members[second];

          const leftRoot = find(left);
          const rightRoot = find(right);

          // Already connected: skip the expensive distance check.
          if (leftRoot === rightRoot) continue;

          if (SimHash.hammingDistance(simhashes[left], simhashes[right]) <= threshold) {
            union(leftRoot, rightRoot);
            involvedIndices.add(left);
            involvedIndices.add(right);
          }
        }
      }
    }
  }

  return { parent, involvedIndices };
}
|
|
189
|
-
|
|
190
|
-
/**
 * Turns union-find results into near-duplicate clusters.
 *
 * Involved indices are grouped by their root; every group with more than one member
 * becomes a cluster named `cluster_near_<n>`, and its pages are tagged with that ID and
 * `duplicateType = 'near'`. Root lookups compress paths, so `parent` is modified in place.
 *
 * @returns The clusters and the next unused counter value.
 */
function extractClusters(parent: Uint32Array, involvedIndices: Set<number>, candidates: GraphNode[], startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
  // Root lookup with full path compression.
  const resolveRoot = (start: number): number => {
    let root = start;
    while (parent[root] !== root) {
      root = parent[root];
    }
    for (let cursor = start; cursor !== root; ) {
      const up = parent[cursor];
      parent[cursor] = root;
      cursor = up;
    }
    return root;
  };

  // Group involved indices by root, preserving the set's iteration order.
  const membersByRoot = new Map<number, number[]>();
  involvedIndices.forEach(position => {
    const root = resolveRoot(position);
    const members = membersByRoot.get(root);
    if (members) {
      members.push(position);
    } else {
      membersByRoot.set(root, [position]);
    }
  });

  const nearClusters: DuplicateCluster[] = [];
  let nextId = startId;

  membersByRoot.forEach(memberPositions => {
    if (memberPositions.length <= 1) return;

    const id = `cluster_near_${nextId++}`;
    const members = memberPositions.map(position => candidates[position]);
    nearClusters.push({ id, type: 'near', nodes: members });
    for (let i = 0; i < members.length; i++) {
      members[i].duplicateClusterId = id;
      members[i].duplicateType = 'near';
    }
  });

  return { nearClusters, nextId };
}
|
|
234
|
-
|
|
235
|
-
/** Runs per-cluster processing for each cluster, in array order. */
function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) {
  for (let position = 0; position < clusters.length; position++) {
    processSingleCluster(clusters[position], graph, collapse);
  }
}
|
|
240
|
-
|
|
241
|
-
/**
 * Finalises one cluster: reclassifies it as template-heavy when applicable, rates its
 * severity, picks its representative page, and records the outcome on the graph.
 * The template-heavy check runs first because the severity rating reads `cluster.type`.
 */
function processSingleCluster(cluster: DuplicateCluster, graph: Graph, collapse: boolean) {
  checkTemplateHeavy(cluster);
  cluster.severity = calculateSeverity(cluster);

  const primaryPage = selectRepresentative(cluster);
  cluster.representative = primaryPage.url;

  applyClusterToGraph(cluster, primaryPage, graph, collapse);
}
|
|
248
|
-
|
|
249
|
-
/**
 * Reclassifies a cluster (and all of its pages) as `template_heavy` when the average
 * `uniqueTokenRatio` of its pages is below 0.3. A missing ratio counts as 0.
 */
function checkTemplateHeavy(cluster: DuplicateCluster) {
  let ratioTotal = 0;
  for (const member of cluster.nodes) {
    ratioTotal = ratioTotal + (member.uniqueTokenRatio || 0);
  }
  const averageRatio = ratioTotal / cluster.nodes.length;

  // Negated "<" (rather than ">=") so a NaN average from an empty cluster changes nothing.
  if (!(averageRatio < 0.3)) return;

  cluster.type = 'template_heavy';
  for (const member of cluster.nodes) {
    member.duplicateType = 'template_heavy';
  }
}
|
|
256
|
-
|
|
257
|
-
/**
 * Rates a duplicate cluster by its canonical declarations.
 *
 * - `high`: at least one page has no canonical, or the pages declare more than one
 *   distinct canonical.
 * - `medium`: canonicals are consistent and the cluster type is `near`.
 * - `low`: canonicals are consistent and the cluster has any other type.
 */
function calculateSeverity(cluster: DuplicateCluster): 'low' | 'medium' | 'high' {
  const declaredCanonicals = new Set<string>();
  let anyMissing = false;

  cluster.nodes.forEach(member => {
    if (member.canonical) {
      declaredCanonicals.add(member.canonical);
    } else {
      anyMissing = true;
    }
  });

  if (anyMissing || declaredCanonicals.size > 1) return 'high';
  return cluster.type === 'near' ? 'medium' : 'low';
}
|
|
274
|
-
|
|
275
|
-
/**
 * Chooses the page that represents a duplicate cluster.
 *
 * The first page that declares itself as its own canonical wins. Otherwise the page with
 * the most `inLinks` is chosen; among equally linked pages the one with the strictly
 * shorter URL wins, and the earlier page is kept on a full tie.
 */
function selectRepresentative(cluster: DuplicateCluster): GraphNode {
  const memberUrls = new Set(cluster.nodes.map(member => member.url));

  const selfCanonical = cluster.nodes.find(
    member => Boolean(member.canonical) && memberUrls.has(member.canonical!) && member.url === member.canonical
  );
  if (selfCanonical) {
    return selfCanonical;
  }

  return cluster.nodes.reduce((leader, candidate) => {
    const moreLinked = candidate.inLinks > leader.inLinks;
    const lessLinked = candidate.inLinks < leader.inLinks;
    const shorterUrl = candidate.url.length < leader.url.length;

    return moreLinked || (!lessLinked && shorterUrl) ? candidate : leader;
  });
}
|
|
290
|
-
|
|
291
|
-
/**
 * Records a processed cluster on the graph and flags its pages.
 *
 * Every page gets `isClusterPrimary`, `isCollapsed` and `collapseInto` set. When
 * `collapse` is on, every page except the representative is marked as collapsed into
 * the representative's URL. A summary entry is appended to `graph.duplicateClusters`.
 */
function applyClusterToGraph(cluster: DuplicateCluster, representative: GraphNode, graph: Graph, collapse: boolean) {
  graph.duplicateClusters.push({
    id: cluster.id,
    type: cluster.type,
    size: cluster.nodes.length,
    representative: representative.url,
    severity: cluster.severity!
  });

  for (const member of cluster.nodes) {
    const isPrimary = member.url === representative.url;
    const folded = collapse ? !isPrimary : false;

    member.isClusterPrimary = isPrimary;
    member.isCollapsed = folded;
    member.collapseInto = folded ? representative.url : undefined;
  }
}
|
|
315
|
-
|
|
316
|
-
/**
 * Rewrites the graph's edges so that links pointing at a collapsed page point at the
 * page it was collapsed into.
 *
 * Only targets are redirected; sources are kept as-is. Edges whose endpoints are missing
 * from the graph and edges that become self-links are dropped. When several edges map to
 * the same key, the largest weight is kept. Afterwards `inLinks`/`outLinks` of all nodes
 * are recounted from the rewritten edge set.
 */
function collapseEdges(graph: Graph) {
  const merged = new Map<string, number>();

  for (const { source, target, weight } of graph.getEdges()) {
    const fromNode = graph.nodes.get(source);
    const toNode = graph.nodes.get(target);
    if (!fromNode || !toNode) continue;

    const redirected = toNode.isCollapsed && toNode.collapseInto ? toNode.collapseInto : target;
    if (redirected === source) continue;

    const key = Graph.getEdgeKey(source, redirected);
    merged.set(key, Math.max(merged.get(key) || 0, weight));
  }

  graph.edges = merged;

  // Re-calculate inLinks and outLinks based on the collapsed edges.
  for (const node of graph.getNodes()) {
    node.inLinks = 0;
    node.outLinks = 0;
  }

  for (const key of merged.keys()) {
    const endpoints = Graph.parseEdgeKey(key);
    const fromNode = graph.nodes.get(endpoints.source);
    const toNode = graph.nodes.get(endpoints.target);
    if (fromNode) fromNode.outLinks++;
    if (toNode) toNode.inLinks++;
  }
}
|