@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
/** Summary of one detected near-duplicate content cluster. */
export interface ClusterInfo {
    /** 1-based cluster id; deterministic across runs (clusters are sorted by size, then primary URL). */
    id: number;
    /** Number of member pages in the cluster. */
    count: number;
    /** Representative page: most in-links, then shortest URL, then lexicographically smallest. */
    primaryUrl: string;
    /** Heuristic duplicate-content risk derived from title/H1 duplication ratios. */
    risk: 'low' | 'medium' | 'high';
    /** Longest URL path prefix shared by all members (e.g. '/blog'), if any. */
    sharedPathPrefix?: string;
    /** Member page URLs, in cluster-discovery (BFS) order. */
    nodes?: string[];
}
/** Tuning knobs for cluster detection. */
export interface ClusteringOptions {
    /** Maximum Hamming distance between SimHashes for two pages to be considered similar. */
    threshold?: number;
    /** Minimum number of pages required to report a cluster. */
    minSize?: number;
}
export declare class ClusteringService {
    /**
     * Detects content clusters using 64-bit SimHash and Hamming Distance.
     * Uses band optimization to reduce O(n^2) comparisons.
     */
    detectContentClusters(graph: Graph, threshold?: number, minSize?: number): ClusterInfo[];
    private selectPrimaryUrl;
    private calculateClusterRisk;
    private findSharedPathPrefix;
}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { SimHash } from '../graph/simhash.js';
import { analyzeH1, analyzeTitle } from './seo.js';
/**
 * Groups near-duplicate pages into clusters using 64-bit SimHash
 * fingerprints and locality-sensitive banding.
 */
export class ClusteringService {
    /**
     * Detects content clusters using 64-bit SimHash and Hamming Distance.
     * Uses band optimization to reduce O(n^2) comparisons.
     *
     * Side effect: every clustered node gets its `clusterId` field set.
     * NOTE(review): with 4 bands of 16 bits, banding only *guarantees*
     * candidate generation for distances <= 3; pairs at distance 4..10 may
     * be missed when no band coincides — presumably an accepted recall
     * trade-off, verify against callers.
     */
    detectContentClusters(graph, threshold = 10, minSize = 3) {
        const candidates = graph.getNodes().filter(n => n.simhash && n.status === 200);
        if (candidates.length === 0)
            return [];
        // LSH banding: pages sharing any 16-bit band become comparison candidates.
        const bandBuckets = Array.from({ length: SimHash.BANDS }, () => new Map());
        for (const node of candidates) {
            const bands = SimHash.getBands(BigInt(node.simhash));
            bands.forEach((value, band) => {
                const bucket = bandBuckets[band].get(value);
                if (bucket) {
                    bucket.push(node.url);
                }
                else {
                    bandBuckets[band].set(value, [node.url]);
                }
            });
        }
        // Build an undirected similarity graph over candidate pairs.
        const neighborSets = new Map();
        const seenPairs = new Set();
        const link = (a, b) => {
            if (!neighborSets.has(a))
                neighborSets.set(a, new Set());
            if (!neighborSets.has(b))
                neighborSets.set(b, new Set());
            neighborSets.get(a).add(b);
            neighborSets.get(b).add(a);
        };
        for (let band = 0; band < SimHash.BANDS; band++) {
            for (const bucket of bandBuckets[band].values()) {
                if (bucket.length < 2)
                    continue;
                for (let i = 0; i < bucket.length; i++) {
                    for (let j = i + 1; j < bucket.length; j++) {
                        const u1 = bucket[i];
                        const u2 = bucket[j];
                        if (u1 === u2)
                            continue;
                        // Canonical unordered key so each pair is compared once.
                        const pairKey = u1 < u2 ? `${u1}|${u2}` : `${u2}|${u1}`;
                        if (seenPairs.has(pairKey))
                            continue;
                        seenPairs.add(pairKey);
                        const dist = SimHash.hammingDistance(BigInt(graph.nodes.get(u1).simhash), BigInt(graph.nodes.get(u2).simhash));
                        if (dist <= threshold)
                            link(u1, u2);
                    }
                }
            }
        }
        // Connected components via iterative BFS (FIFO order, preserved in output).
        const visited = new Set();
        const components = [];
        for (const node of candidates) {
            if (visited.has(node.url))
                continue;
            visited.add(node.url);
            const queue = [node.url];
            const members = [];
            for (let head = 0; head < queue.length; head++) {
                const current = queue[head];
                members.push(current);
                const neighbors = neighborSets.get(current);
                if (!neighbors)
                    continue;
                for (const next of neighbors) {
                    if (!visited.has(next)) {
                        visited.add(next);
                        queue.push(next);
                    }
                }
            }
            if (members.length >= minSize)
                components.push(members);
        }
        // Deterministic IDs: largest cluster first, ties broken by primary URL.
        components.sort((a, b) => b.length - a.length
            || this.selectPrimaryUrl(a, graph).localeCompare(this.selectPrimaryUrl(b, graph)));
        return components.map((memberUrls, index) => {
            const clusterId = index + 1;
            const memberNodes = memberUrls.map(url => graph.nodes.get(url));
            // Tag every member node with its cluster id.
            memberNodes.forEach(n => { n.clusterId = clusterId; });
            return {
                id: clusterId,
                count: memberUrls.length,
                primaryUrl: this.selectPrimaryUrl(memberUrls, graph),
                risk: this.calculateClusterRisk(memberNodes),
                sharedPathPrefix: this.findSharedPathPrefix(memberUrls),
                nodes: memberUrls
            };
        });
    }
    /**
     * Picks the representative URL of a cluster: most in-links wins;
     * ties fall back to the shortest, then lexicographically smallest URL.
     */
    selectPrimaryUrl(urls, graph) {
        let best = urls[0];
        for (let i = 1; i < urls.length; i++) {
            const candidate = urls[i];
            const candNode = graph.nodes.get(candidate);
            const bestNode = graph.nodes.get(best);
            if (candNode.inLinks !== bestNode.inLinks) {
                if (candNode.inLinks > bestNode.inLinks)
                    best = candidate;
            }
            else if (candidate.length !== best.length) {
                if (candidate.length < best.length)
                    best = candidate;
            }
            else if (candidate.localeCompare(best) < 0) {
                best = candidate;
            }
        }
        return best;
    }
    /**
     * Estimates SEO risk from duplicated <title>/<h1> values across the
     * cluster. Falls back to a pure size-based heuristic when fewer than
     * half of the nodes carry raw HTML to analyze.
     */
    calculateClusterRisk(nodes) {
        if (nodes.length <= 1)
            return 'low';
        const titleCounts = new Map();
        const h1Counts = new Map();
        const tally = (map, value) => {
            if (!value)
                return;
            const key = value.toLowerCase();
            map.set(key, (map.get(key) || 0) + 1);
        };
        let withHtml = 0;
        for (const node of nodes) {
            if (!node.html)
                continue;
            const titleRes = analyzeTitle(node.html);
            const h1Res = analyzeH1(node.html, titleRes.value);
            tally(titleCounts, titleRes.value || '');
            tally(h1Counts, h1Res.value || '');
            withHtml++;
        }
        // Not enough HTML to judge content: size alone drives the verdict.
        if (withHtml < nodes.length * 0.5) {
            if (nodes.length > 5)
                return 'high';
            return nodes.length > 2 ? 'medium' : 'low';
        }
        const countDuplicated = (map) => {
            let total = 0;
            for (const occurrences of map.values()) {
                if (occurrences > 1)
                    total += occurrences;
            }
            return total;
        };
        const titleDupeRatio = countDuplicated(titleCounts) / nodes.length;
        const h1DupeRatio = countDuplicated(h1Counts) / nodes.length;
        if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3)
            return 'high';
        if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10)
            return 'medium';
        return 'low';
    }
    /**
     * Returns the longest path prefix shared by all member URLs
     * (e.g. '/blog'), or undefined when there is none.
     */
    findSharedPathPrefix(urls) {
        if (urls.length < 2)
            return undefined;
        try {
            // Works for both absolute URLs (https://example.com/foo) and root-relative paths (/foo)
            const segmentsOf = (u) => {
                try {
                    return new URL(u).pathname.split('/').filter(Boolean);
                }
                catch {
                    // root-relative path like '/foo/bar'
                    return u.split('/').filter(Boolean);
                }
            };
            const allPaths = urls.map(segmentsOf);
            const reference = allPaths[0];
            const shared = [];
            for (let i = 0; i < reference.length; i++) {
                const segment = reference[i];
                if (!allPaths.every(p => p[i] === segment))
                    break;
                shared.push(segment);
            }
            return shared.length > 0 ? '/' + shared.join('/') : undefined;
        }
        catch {
            return undefined;
        }
    }
}
|
|
@@ -8,5 +8,5 @@ export interface ThinScoreWeights {
|
|
|
8
8
|
ratioWeight: number;
|
|
9
9
|
dupWeight: number;
|
|
10
10
|
}
|
|
11
|
-
export declare function analyzeContent(
|
|
11
|
+
export declare function analyzeContent($: any): ContentAnalysis;
|
|
12
12
|
export declare function calculateThinContentScore(content: ContentAnalysis, duplicationScore: number, weights?: ThinScoreWeights): number;
|
package/dist/analysis/content.js
CHANGED
|
@@ -4,14 +4,20 @@ const DEFAULT_WEIGHTS = {
|
|
|
4
4
|
ratioWeight: 0.35,
|
|
5
5
|
dupWeight: 0.25
|
|
6
6
|
};
|
|
7
|
-
export function analyzeContent(
|
|
8
|
-
const
|
|
9
|
-
$
|
|
10
|
-
|
|
7
|
+
export function analyzeContent($) {
|
|
8
|
+
const isString = typeof $ === 'string';
|
|
9
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
10
|
+
// We don't want to modify the shared $ object if we remove elements
|
|
11
|
+
// So we create a localized copy of the body text or use selection
|
|
12
|
+
const body = cheerioObj('body').length ? cheerioObj('body') : cheerioObj('html');
|
|
13
|
+
// To avoid removing from shared $, we extract text from a clone if possible,
|
|
14
|
+
// but cloning in cheerio is expensive.
|
|
15
|
+
// Better: just get the text and clean it or use a filter.
|
|
16
|
+
const text = body.clone().find('script,style,nav,footer').remove().end().text();
|
|
11
17
|
const cleanText = text.replace(/\s+/g, ' ').trim();
|
|
12
18
|
const words = cleanText ? cleanText.split(/\s+/).filter(Boolean) : [];
|
|
13
19
|
const wordCount = words.length;
|
|
14
|
-
const htmlLength =
|
|
20
|
+
const htmlLength = isString ? ($.length || 1) : 1000; // Fallback if we don't have original HTML length
|
|
15
21
|
const textHtmlRatio = cleanText.length / htmlLength;
|
|
16
22
|
const sentenceSet = new Set(cleanText
|
|
17
23
|
.split(/[.!?]+/)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { Graph, GraphNode } from '../graph/graph.js';
/** Options controlling duplicate detection. */
export interface DuplicateOptions {
    /** When true (the default), collapsed duplicates have their inbound edges redirected to the cluster representative. */
    collapse?: boolean;
    /** Maximum SimHash Hamming distance for "near" duplicates (defaults to 3). */
    simhashThreshold?: number;
}
/** One detected duplicate group. */
export interface DuplicateCluster {
    /** Stable id, e.g. 'cluster_exact_1' or 'cluster_near_2'. */
    id: string;
    /** 'exact' (identical content hash), 'near' (SimHash proximity), or 'template_heavy' (low unique-token ratio). */
    type: 'exact' | 'near' | 'template_heavy';
    /** Member pages. */
    nodes: GraphNode[];
    /** Number of member pages. */
    size: number;
    /** URL of the page chosen to represent the cluster. */
    representative?: string;
    /** 'high' when canonicals are missing/conflicting, 'medium' for near duplicates, otherwise 'low'. */
    severity?: 'low' | 'medium' | 'high';
}
export declare class DuplicateService {
    /**
     * Detects exact and near duplicates, identifies canonical conflicts,
     * and performs non-destructive collapse of edges.
     */
    detectDuplicates(graph: Graph, options?: DuplicateOptions): DuplicateCluster[];
    private findExactDuplicates;
    private groupNodesByContentHash;
    private createExactClusters;
    private findNearDuplicates;
    private buildSimHashBuckets;
    private findConnectedComponents;
    private extractClusters;
    private processClusters;
    private processSingleCluster;
    private checkTemplateHeavy;
    private calculateSeverity;
    private selectRepresentative;
    private applyClusterToGraph;
    private collapseEdges;
}
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
import { SimHash } from '../graph/simhash.js';
/**
 * Finds exact and near duplicate pages in a crawl graph, annotates the
 * nodes, and (optionally) collapses duplicate nodes into their cluster
 * representative by rewriting edge targets.
 */
export class DuplicateService {
    /**
     * Detects exact and near duplicates, identifies canonical conflicts,
     * and performs non-destructive collapse of edges.
     */
    detectDuplicates(graph, options = {}) {
        const collapse = options.collapse !== false; // Default to true
        const threshold = options.simhashThreshold ?? 3;
        const nodes = graph.getNodes();
        let clusterCounter = 1;
        // Phase 1 & 2: Exact Duplicate Detection
        const { exactClusters, nearCandidates, nextId: nextId1 } = this.findExactDuplicates(nodes, clusterCounter);
        clusterCounter = nextId1;
        // Phase 3: Near Duplicate Detection
        const { nearClusters } = this.findNearDuplicates(nearCandidates, threshold, clusterCounter);
        const allClusters = [...exactClusters, ...nearClusters];
        // Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
        this.processClusters(allClusters, graph, collapse);
        // Final Edge Transfer if Collapsing
        if (collapse) {
            this.collapseEdges(graph);
        }
        return allClusters;
    }
    // Groups nodes by exact content hash, then splits the groups into
    // multi-member exact clusters and singletons (near-dup candidates).
    findExactDuplicates(nodes, startId) {
        const exactMap = this.groupNodesByContentHash(nodes);
        return this.createExactClusters(exactMap, startId);
    }
    // Buckets successfully-fetched (status 200) nodes by their contentHash.
    groupNodesByContentHash(nodes) {
        const exactMap = new Map();
        for (const node of nodes) {
            if (!node.contentHash || node.status !== 200)
                continue;
            let arr = exactMap.get(node.contentHash);
            if (!arr) {
                arr = [];
                exactMap.set(node.contentHash, arr);
            }
            arr.push(node);
        }
        return exactMap;
    }
    // Turns hash groups of size > 1 into 'exact' clusters and tags their
    // nodes; size-1 groups become candidates for near-duplicate detection.
    createExactClusters(exactMap, startId) {
        const exactClusters = [];
        const nearCandidates = [];
        let clusterCounter = startId;
        for (const group of exactMap.values()) {
            if (group.length > 1) {
                const id = `cluster_exact_${clusterCounter++}`;
                exactClusters.push({ id, type: 'exact', nodes: group, size: group.length });
                for (const n of group) {
                    n.duplicateClusterId = id;
                    n.duplicateType = 'exact';
                }
            }
            else {
                nearCandidates.push(group[0]);
            }
        }
        return { exactClusters, nearCandidates, nextId: clusterCounter };
    }
    // Near-duplicate pipeline: LSH banding -> union-find over close pairs
    // -> cluster extraction.
    findNearDuplicates(candidates, threshold, startId) {
        const { bandsMaps, simhashes } = this.buildSimHashBuckets(candidates);
        const { parent, involvedIndices } = this.findConnectedComponents(bandsMaps, simhashes, candidates.length, threshold);
        return this.extractClusters(parent, involvedIndices, candidates, startId);
    }
    // Parses each candidate's SimHash into a BigUint64Array (index-aligned
    // with `candidates`; entries without a simhash stay 0 and are skipped)
    // and buckets candidate indices per SimHash band.
    buildSimHashBuckets(candidates) {
        const n = candidates.length;
        const simhashes = new BigUint64Array(n);
        const validIndices = [];
        for (let i = 0; i < n; i++) {
            if (candidates[i].simhash) {
                simhashes[i] = BigInt(candidates[i].simhash);
                validIndices.push(i);
            }
        }
        const bandsMaps = Array.from({ length: SimHash.BANDS }, () => new Map());
        for (const idx of validIndices) {
            const bands = SimHash.getBands(simhashes[idx]);
            for (let b = 0; b < SimHash.BANDS; b++) {
                let arr = bandsMaps[b].get(bands[b]);
                if (!arr) {
                    arr = [];
                    bandsMaps[b].set(bands[b], arr);
                }
                arr.push(idx);
            }
        }
        return { bandsMaps, simhashes, validIndices };
    }
    // Union-find (path compression + union by rank) over candidate indices:
    // any pair sharing a band bucket whose Hamming distance <= threshold is
    // unioned. Returns the parent array plus the set of indices that joined
    // at least one pair.
    findConnectedComponents(bandsMaps, simhashes, n, threshold) {
        const parent = new Uint32Array(n);
        const rank = new Uint8Array(n);
        for (let i = 0; i < n; i++) {
            parent[i] = i;
            rank[i] = 0;
        }
        // Find with two-pass path compression.
        const find = (i) => {
            let root = i;
            while (parent[root] !== root) {
                root = parent[root];
            }
            let curr = i;
            while (curr !== root) {
                const next = parent[curr];
                parent[curr] = root;
                curr = next;
            }
            return root;
        };
        // Union by rank.
        const union = (i, j) => {
            const rootI = find(i);
            const rootJ = find(j);
            if (rootI !== rootJ) {
                const rankI = rank[rootI];
                const rankJ = rank[rootJ];
                if (rankI < rankJ) {
                    parent[rootI] = rootJ;
                }
                else if (rankI > rankJ) {
                    parent[rootJ] = rootI;
                }
                else {
                    parent[rootJ] = rootI;
                    rank[rootI]++;
                }
            }
        };
        const involvedIndices = new Set();
        for (let b = 0; b < SimHash.BANDS; b++) {
            for (const bucketIndices of bandsMaps[b].values()) {
                if (bucketIndices.length < 2)
                    continue;
                for (let j = 0; j < bucketIndices.length; j++) {
                    for (let k = j + 1; k < bucketIndices.length; k++) {
                        const idx1 = bucketIndices[j];
                        const idx2 = bucketIndices[k];
                        const root1 = find(idx1);
                        const root2 = find(idx2);
                        if (root1 === root2)
                            continue;
                        const dist = SimHash.hammingDistance(simhashes[idx1], simhashes[idx2]);
                        if (dist <= threshold) {
                            union(root1, root2);
                            involvedIndices.add(idx1);
                            involvedIndices.add(idx2);
                        }
                    }
                }
            }
        }
        return { parent, involvedIndices };
    }
    // Materializes 'near' clusters from union-find roots and tags member
    // nodes. Only indices that actually paired up are considered.
    extractClusters(parent, involvedIndices, candidates, startId) {
        const nearClusters = [];
        let clusterCounter = startId;
        // Local find mirrors findConnectedComponents (parent array is shared).
        const find = (i) => {
            let root = i;
            while (parent[root] !== root) {
                root = parent[root];
            }
            let curr = i;
            while (curr !== root) {
                const next = parent[curr];
                parent[curr] = root;
                curr = next;
            }
            return root;
        };
        const clusterMap = new Map();
        for (const idx of involvedIndices) {
            const root = find(idx);
            let group = clusterMap.get(root);
            if (!group) {
                group = [];
                clusterMap.set(root, group);
            }
            group.push(idx);
        }
        for (const groupIndices of clusterMap.values()) {
            if (groupIndices.length > 1) {
                const id = `cluster_near_${clusterCounter++}`;
                const groupNodes = groupIndices.map(idx => candidates[idx]);
                nearClusters.push({ id, type: 'near', nodes: groupNodes, size: groupNodes.length });
                for (const n of groupNodes) {
                    n.duplicateClusterId = id;
                    n.duplicateType = 'near';
                }
            }
        }
        return { nearClusters, nextId: clusterCounter };
    }
    // Runs per-cluster post-processing (template check, severity,
    // representative selection, graph annotation).
    processClusters(clusters, graph, collapse) {
        for (const cluster of clusters) {
            this.processSingleCluster(cluster, graph, collapse);
        }
    }
    processSingleCluster(cluster, graph, collapse) {
        this.checkTemplateHeavy(cluster);
        cluster.severity = this.calculateSeverity(cluster);
        const representative = this.selectRepresentative(cluster);
        cluster.representative = representative.url;
        this.applyClusterToGraph(cluster, representative, graph, collapse);
    }
    // Reclassifies a cluster as 'template_heavy' when the average
    // unique-token ratio of its members is below 0.3 (mostly boilerplate).
    checkTemplateHeavy(cluster) {
        const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
        if (avgRatio < 0.3) {
            cluster.type = 'template_heavy';
            cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
        }
    }
    // Severity: 'high' when any member lacks a canonical or members point
    // at conflicting canonicals; 'medium' for near duplicates; else 'low'.
    calculateSeverity(cluster) {
        const canonicals = new Set();
        let hasMissing = false;
        for (const n of cluster.nodes) {
            if (!n.canonical)
                hasMissing = true;
            else
                canonicals.add(n.canonical);
        }
        if (hasMissing || canonicals.size > 1) {
            return 'high';
        }
        else if (cluster.type === 'near') {
            return 'medium';
        }
        else {
            return 'low';
        }
    }
    // Representative preference: a self-canonical page whose canonical stays
    // inside the cluster; otherwise the member with the most in-links
    // (shorter URL breaks ties).
    selectRepresentative(cluster) {
        const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
        const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
        if (validCanonicals.length > 0) {
            return validCanonicals[0];
        }
        return cluster.nodes.reduce((best, current) => {
            if (current.inLinks > best.inLinks)
                return current;
            if (current.inLinks < best.inLinks)
                return best;
            if (current.url.length < best.url.length)
                return current;
            return best;
        });
    }
    // Annotates member nodes (isClusterPrimary / isCollapsed / collapseInto),
    // records a cluster summary on graph.duplicateClusters, and, when
    // collapsing, marks non-representative members as collapsed.
    applyClusterToGraph(cluster, representative, graph, collapse) {
        cluster.nodes.forEach(n => {
            n.isClusterPrimary = n.url === representative.url;
            n.isCollapsed = false;
            n.collapseInto = undefined;
        });
        if (!graph.duplicateClusters) {
            graph.duplicateClusters = [];
        }
        graph.duplicateClusters.push({
            id: cluster.id,
            type: cluster.type,
            size: cluster.nodes.length,
            representative: representative.url,
            severity: cluster.severity
        });
        if (collapse) {
            for (const n of cluster.nodes) {
                if (n.url !== representative.url) {
                    n.isCollapsed = true;
                    n.collapseInto = representative.url;
                }
            }
        }
    }
    // Rewrites edges whose TARGET is a collapsed node to point at that
    // node's representative, deduplicating by edge key and keeping the
    // maximum weight, then recomputes in/out link counts.
    // NOTE(review): edge sources are deliberately left untouched, so a
    // collapsed page keeps its outgoing edges — confirm this is intended.
    // NOTE(review): graph.edges is replaced with a Map of edgeKey -> weight
    // (a number); assumes Graph consumers read weights via this shape —
    // verify against the Graph class.
    collapseEdges(graph) {
        const edges = graph.getEdges();
        const updatedEdges = new Map();
        for (const edge of edges) {
            const targetNode = graph.nodes.get(edge.target);
            if (!targetNode)
                continue;
            const actualSource = edge.source;
            const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
            // Drop self-loops created by the redirection.
            if (actualSource === actualTarget)
                continue;
            const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
            const existingWeight = updatedEdges.get(edgeKey) || 0;
            updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
        }
        graph.edges = updatedEdges;
        // Re-calculate inLinks and outLinks
        for (const node of graph.getNodes()) {
            node.inLinks = 0;
            node.outLinks = 0;
        }
        for (const edgeKey of updatedEdges.keys()) {
            const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
            const sn = graph.nodes.get(src);
            const tn = graph.nodes.get(tgt);
            if (sn)
                sn.outLinks++;
            if (tn)
                tn.inLinks++;
        }
    }
}
|