@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export interface CrawlithConfig {
|
|
2
|
+
[section: string]: {
|
|
3
|
+
key?: string;
|
|
4
|
+
createdAt?: number;
|
|
5
|
+
[key: string]: unknown;
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Resolve the canonical Crawlith config file path.
|
|
10
|
+
*/
|
|
11
|
+
export declare function getCrawlithConfigPath(): string;
|
|
12
|
+
/**
|
|
13
|
+
* Return section config, or undefined if config file/section does not exist.
|
|
14
|
+
*/
|
|
15
|
+
export declare function getConfigSection(section: string): CrawlithConfig[string] | undefined;
|
|
16
|
+
/**
|
|
17
|
+
* Encrypt and persist a section API key in ~/.crawlith/config.json.
|
|
18
|
+
*/
|
|
19
|
+
export declare function setEncryptedConfigKey(section: string, apiKey: string): void;
|
|
20
|
+
/**
|
|
21
|
+
* Get and decrypt the API key for a config section.
|
|
22
|
+
*/
|
|
23
|
+
export declare function getDecryptedConfigKey(section: string): string;
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import os from 'node:os';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import crypto from 'node:crypto';
|
|
5
|
+
const CONFIG_DIR = path.join(os.homedir(), '.crawlith');
|
|
6
|
+
const CONFIG_PATH = path.join(CONFIG_DIR, 'config.json');
|
|
7
|
+
/**
|
|
8
|
+
* Resolve the canonical Crawlith config file path.
|
|
9
|
+
*/
|
|
10
|
+
export function getCrawlithConfigPath() {
|
|
11
|
+
return CONFIG_PATH;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Return section config, or undefined if config file/section does not exist.
|
|
15
|
+
*/
|
|
16
|
+
export function getConfigSection(section) {
|
|
17
|
+
const config = readConfigFile(false);
|
|
18
|
+
if (!config)
|
|
19
|
+
return undefined;
|
|
20
|
+
return config[section];
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Encrypt and persist a section API key in ~/.crawlith/config.json.
|
|
24
|
+
*/
|
|
25
|
+
export function setEncryptedConfigKey(section, apiKey) {
|
|
26
|
+
const config = readConfigFile(false) || {};
|
|
27
|
+
config[section] = {
|
|
28
|
+
...(config[section] || {}),
|
|
29
|
+
key: encryptString(apiKey),
|
|
30
|
+
createdAt: Math.floor(Date.now() / 1000)
|
|
31
|
+
};
|
|
32
|
+
writeConfigFile(config);
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Get and decrypt the API key for a config section.
|
|
36
|
+
*/
|
|
37
|
+
export function getDecryptedConfigKey(section) {
|
|
38
|
+
if (!fs.existsSync(CONFIG_PATH)) {
|
|
39
|
+
throw new Error(`Missing ${section} config. Run: crawlith config ${section} set <api_key>`);
|
|
40
|
+
}
|
|
41
|
+
const config = readConfigFile(true);
|
|
42
|
+
if (!config) {
|
|
43
|
+
throw new Error(`Missing ${section} config. Run: crawlith config ${section} set <api_key>`);
|
|
44
|
+
}
|
|
45
|
+
const payload = config[section]?.key;
|
|
46
|
+
if (!payload || typeof payload !== 'string') {
|
|
47
|
+
throw new Error(`Missing ${section} key in config. Run: crawlith config ${section} set <api_key>`);
|
|
48
|
+
}
|
|
49
|
+
return decryptString(payload);
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Read config file from disk.
|
|
53
|
+
*/
|
|
54
|
+
function readConfigFile(required) {
|
|
55
|
+
if (!fs.existsSync(CONFIG_PATH)) {
|
|
56
|
+
if (required) {
|
|
57
|
+
throw new Error('Missing config file. Run: crawlith config <service> set <api_key>');
|
|
58
|
+
}
|
|
59
|
+
return null;
|
|
60
|
+
}
|
|
61
|
+
try {
|
|
62
|
+
return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
|
|
63
|
+
}
|
|
64
|
+
catch {
|
|
65
|
+
throw new Error('Corrupt config file at ~/.crawlith/config.json. Refusing to continue.');
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Persist config to disk with secure permissions.
|
|
70
|
+
*/
|
|
71
|
+
function writeConfigFile(config) {
|
|
72
|
+
if (!fs.existsSync(CONFIG_DIR)) {
|
|
73
|
+
fs.mkdirSync(CONFIG_DIR, { recursive: true, mode: 0o700 });
|
|
74
|
+
}
|
|
75
|
+
fs.writeFileSync(CONFIG_PATH, JSON.stringify(config, null, 2), { encoding: 'utf8', mode: 0o600 });
|
|
76
|
+
fs.chmodSync(CONFIG_PATH, 0o600);
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Build a machine-bound secret so encrypted config blobs are not portable across systems.
|
|
80
|
+
*/
|
|
81
|
+
function getMachineSecret() {
|
|
82
|
+
return `${os.hostname()}::${os.userInfo().username}`;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Encrypt plaintext using AES-256-GCM and scrypt-derived key.
|
|
86
|
+
*/
|
|
87
|
+
function encryptString(plaintext) {
|
|
88
|
+
const salt = crypto.randomBytes(16);
|
|
89
|
+
const iv = crypto.randomBytes(12);
|
|
90
|
+
const key = crypto.scryptSync(getMachineSecret(), salt, 32);
|
|
91
|
+
const cipher = crypto.createCipheriv('aes-256-gcm', key, iv);
|
|
92
|
+
const encrypted = Buffer.concat([cipher.update(plaintext, 'utf8'), cipher.final()]);
|
|
93
|
+
const payload = {
|
|
94
|
+
salt: salt.toString('base64'),
|
|
95
|
+
iv: iv.toString('base64'),
|
|
96
|
+
tag: cipher.getAuthTag().toString('base64'),
|
|
97
|
+
data: encrypted.toString('base64')
|
|
98
|
+
};
|
|
99
|
+
return Buffer.from(JSON.stringify(payload), 'utf8').toString('base64');
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Decrypt an encrypted base64 payload from config.json.
|
|
103
|
+
*/
|
|
104
|
+
function decryptString(encodedPayload) {
|
|
105
|
+
let payload;
|
|
106
|
+
try {
|
|
107
|
+
payload = JSON.parse(Buffer.from(encodedPayload, 'base64').toString('utf8'));
|
|
108
|
+
}
|
|
109
|
+
catch {
|
|
110
|
+
throw new Error('Corrupt config payload: unable to parse encrypted key data.');
|
|
111
|
+
}
|
|
112
|
+
if (!payload?.salt || !payload?.iv || !payload?.tag || !payload?.data) {
|
|
113
|
+
throw new Error('Corrupt config payload: required encryption fields are missing.');
|
|
114
|
+
}
|
|
115
|
+
try {
|
|
116
|
+
const salt = Buffer.from(payload.salt, 'base64');
|
|
117
|
+
const iv = Buffer.from(payload.iv, 'base64');
|
|
118
|
+
const tag = Buffer.from(payload.tag, 'base64');
|
|
119
|
+
const data = Buffer.from(payload.data, 'base64');
|
|
120
|
+
const key = crypto.scryptSync(getMachineSecret(), salt, 32);
|
|
121
|
+
const decipher = crypto.createDecipheriv('aes-256-gcm', key, iv);
|
|
122
|
+
decipher.setAuthTag(tag);
|
|
123
|
+
return Buffer.concat([decipher.update(data), decipher.final()]).toString('utf8');
|
|
124
|
+
}
|
|
125
|
+
catch {
|
|
126
|
+
throw new Error('Unable to decrypt config key. Config may be invalid or tied to another machine/user.');
|
|
127
|
+
}
|
|
128
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlith/core",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"license": "Apache-2.0",
|
|
4
|
+
"version": "0.1.2",
|
|
5
|
+
"description": "Headless intelligence engine for Crawlith. Handles crawling, graph analysis, scoring, and data persistence.",
|
|
4
6
|
"type": "module",
|
|
5
7
|
"main": "dist/index.js",
|
|
6
8
|
"types": "dist/index.d.ts",
|
|
@@ -11,19 +13,23 @@
|
|
|
11
13
|
"default": "./dist/index.js"
|
|
12
14
|
}
|
|
13
15
|
},
|
|
16
|
+
"files": [
|
|
17
|
+
"dist"
|
|
18
|
+
],
|
|
14
19
|
"dependencies": {
|
|
15
20
|
"better-sqlite3": "^12.6.2",
|
|
16
|
-
"chalk": "^5.3.0",
|
|
17
21
|
"cheerio": "^1.0.0-rc.12",
|
|
22
|
+
"commander": "^12.1.0",
|
|
18
23
|
"p-limit": "^7.3.0",
|
|
19
24
|
"robots-parser": "^3.0.1",
|
|
20
|
-
"undici": "^6.13.0",
|
|
21
|
-
"vite": "7.3.1"
|
|
25
|
+
"undici": "^6.13.0"
|
|
22
26
|
},
|
|
23
27
|
"devDependencies": {
|
|
24
28
|
"@types/better-sqlite3": "^7.6.13",
|
|
29
|
+
"@types/cheerio": "0.22.35",
|
|
25
30
|
"@types/node": "^20.12.7",
|
|
26
31
|
"typescript": "^5.4.5",
|
|
32
|
+
"vite": "7.3.1",
|
|
27
33
|
"vitest": "^4.0.18"
|
|
28
34
|
},
|
|
29
35
|
"scripts": {
|
package/CHANGELOG.md
DELETED
package/dist/db/schema.d.ts
DELETED
package/dist/graph/cluster.d.ts
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import { Graph, ClusterInfo } from './graph.js';
|
|
2
|
-
/**
|
|
3
|
-
* Detects content clusters using 64-bit SimHash and Hamming Distance.
|
|
4
|
-
* Uses band optimization to reduce O(n^2) comparisons.
|
|
5
|
-
*/
|
|
6
|
-
export declare function detectContentClusters(graph: Graph, threshold?: number, minSize?: number): ClusterInfo[];
|
package/dist/graph/cluster.js
DELETED
|
@@ -1,221 +0,0 @@
|
|
|
1
|
-
import { SimHash } from './simhash.js';
|
|
2
|
-
import { load } from 'cheerio';
|
|
3
|
-
/**
|
|
4
|
-
* Detects content clusters using 64-bit SimHash and Hamming Distance.
|
|
5
|
-
* Uses band optimization to reduce O(n^2) comparisons.
|
|
6
|
-
*/
|
|
7
|
-
export function detectContentClusters(graph, threshold = 10, minSize = 3) {
|
|
8
|
-
const nodes = graph.getNodes().filter(n => n.simhash && n.status === 200);
|
|
9
|
-
if (nodes.length === 0)
|
|
10
|
-
return [];
|
|
11
|
-
const adjacency = new Map();
|
|
12
|
-
// Banding Optimization (4 bands of 16 bits)
|
|
13
|
-
// Note: For threshold > 3, this is a heuristic and may miss some pairs,
|
|
14
|
-
// but it dramatically reduces the search space as requested.
|
|
15
|
-
const buckets = Array.from({ length: SimHash.BANDS }, () => new Map());
|
|
16
|
-
for (const node of nodes) {
|
|
17
|
-
const hash = BigInt(node.simhash);
|
|
18
|
-
const bandValues = SimHash.getBands(hash);
|
|
19
|
-
bandValues.forEach((bandValue, b) => {
|
|
20
|
-
if (!buckets[b].has(bandValue)) {
|
|
21
|
-
buckets[b].set(bandValue, new Set());
|
|
22
|
-
}
|
|
23
|
-
buckets[b].get(bandValue).add(node.url);
|
|
24
|
-
});
|
|
25
|
-
}
|
|
26
|
-
const checkedPairs = new Set();
|
|
27
|
-
for (let b = 0; b < SimHash.BANDS; b++) {
|
|
28
|
-
for (const bucket of buckets[b].values()) {
|
|
29
|
-
if (bucket.size < 2)
|
|
30
|
-
continue;
|
|
31
|
-
const bucketNodes = Array.from(bucket);
|
|
32
|
-
for (let i = 0; i < bucketNodes.length; i++) {
|
|
33
|
-
for (let j = i + 1; j < bucketNodes.length; j++) {
|
|
34
|
-
const u1 = bucketNodes[i];
|
|
35
|
-
const u2 = bucketNodes[j];
|
|
36
|
-
if (u1 === u2)
|
|
37
|
-
continue;
|
|
38
|
-
const pairKey = u1 < u2 ? `${u1}|${u2}` : `${u2}|${u1}`;
|
|
39
|
-
if (checkedPairs.has(pairKey))
|
|
40
|
-
continue;
|
|
41
|
-
checkedPairs.add(pairKey);
|
|
42
|
-
const n1 = graph.nodes.get(u1);
|
|
43
|
-
const n2 = graph.nodes.get(u2);
|
|
44
|
-
const dist = SimHash.hammingDistance(BigInt(n1.simhash), BigInt(n2.simhash));
|
|
45
|
-
if (dist <= threshold) {
|
|
46
|
-
if (!adjacency.has(u1))
|
|
47
|
-
adjacency.set(u1, new Set());
|
|
48
|
-
if (!adjacency.has(u2))
|
|
49
|
-
adjacency.set(u2, new Set());
|
|
50
|
-
adjacency.get(u1).add(u2);
|
|
51
|
-
adjacency.get(u2).add(u1);
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
// Find connected components (Clusters)
|
|
58
|
-
const visited = new Set();
|
|
59
|
-
const clusters = [];
|
|
60
|
-
for (const node of nodes) {
|
|
61
|
-
if (visited.has(node.url))
|
|
62
|
-
continue;
|
|
63
|
-
const component = [];
|
|
64
|
-
const queue = [node.url];
|
|
65
|
-
visited.add(node.url);
|
|
66
|
-
while (queue.length > 0) {
|
|
67
|
-
const current = queue.shift();
|
|
68
|
-
component.push(current);
|
|
69
|
-
const neighbors = adjacency.get(current);
|
|
70
|
-
if (neighbors) {
|
|
71
|
-
for (const neighbor of neighbors) {
|
|
72
|
-
if (!visited.has(neighbor)) {
|
|
73
|
-
visited.add(neighbor);
|
|
74
|
-
queue.push(neighbor);
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
if (component.length >= minSize) {
|
|
80
|
-
clusters.push(component);
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
// Sort clusters by size (descending) then by primary URL (ascending) for deterministic IDs
|
|
84
|
-
clusters.sort((a, b) => {
|
|
85
|
-
if (b.length !== a.length)
|
|
86
|
-
return b.length - a.length;
|
|
87
|
-
const aPrimary = selectPrimaryUrl(a, graph);
|
|
88
|
-
const bPrimary = selectPrimaryUrl(b, graph);
|
|
89
|
-
return aPrimary.localeCompare(bPrimary);
|
|
90
|
-
});
|
|
91
|
-
const clusterInfos = [];
|
|
92
|
-
clusters.forEach((memberUrls, index) => {
|
|
93
|
-
const clusterId = index + 1;
|
|
94
|
-
const clusterNodes = memberUrls.map(url => graph.nodes.get(url));
|
|
95
|
-
for (const node of clusterNodes) {
|
|
96
|
-
node.clusterId = clusterId;
|
|
97
|
-
}
|
|
98
|
-
const primaryUrl = selectPrimaryUrl(memberUrls, graph);
|
|
99
|
-
const risk = calculateClusterRisk(clusterNodes);
|
|
100
|
-
const sharedPathPrefix = findSharedPathPrefix(memberUrls);
|
|
101
|
-
clusterInfos.push({
|
|
102
|
-
id: clusterId,
|
|
103
|
-
count: memberUrls.length,
|
|
104
|
-
primaryUrl,
|
|
105
|
-
risk,
|
|
106
|
-
sharedPathPrefix
|
|
107
|
-
});
|
|
108
|
-
});
|
|
109
|
-
graph.contentClusters = clusterInfos;
|
|
110
|
-
return clusterInfos;
|
|
111
|
-
}
|
|
112
|
-
/**
|
|
113
|
-
* Selects the primary URL for a cluster based on:
|
|
114
|
-
* 1. Highest PageRank
|
|
115
|
-
* 2. Shortest URL
|
|
116
|
-
* 3. Lexicographic fallback
|
|
117
|
-
*/
|
|
118
|
-
function selectPrimaryUrl(urls, graph) {
|
|
119
|
-
return urls.reduce((best, current) => {
|
|
120
|
-
const nBest = graph.nodes.get(best);
|
|
121
|
-
const nCurrent = graph.nodes.get(current);
|
|
122
|
-
if ((nCurrent.pageRank || 0) > (nBest.pageRank || 0))
|
|
123
|
-
return current;
|
|
124
|
-
if ((nCurrent.pageRank || 0) < (nBest.pageRank || 0))
|
|
125
|
-
return best;
|
|
126
|
-
if (current.length < best.length)
|
|
127
|
-
return current;
|
|
128
|
-
if (current.length > best.length)
|
|
129
|
-
return best;
|
|
130
|
-
return current.localeCompare(best) < 0 ? current : best;
|
|
131
|
-
});
|
|
132
|
-
}
|
|
133
|
-
/**
|
|
134
|
-
* Calculates cannibalization risk based on title and H1 similarity within the cluster.
|
|
135
|
-
*/
|
|
136
|
-
function calculateClusterRisk(nodes) {
|
|
137
|
-
if (nodes.length <= 1)
|
|
138
|
-
return 'low';
|
|
139
|
-
// Count title and H1 occurrences
|
|
140
|
-
const titleCounts = new Map();
|
|
141
|
-
const h1Counts = new Map();
|
|
142
|
-
let processedCount = 0;
|
|
143
|
-
for (const node of nodes) {
|
|
144
|
-
if (!node.html)
|
|
145
|
-
continue;
|
|
146
|
-
try {
|
|
147
|
-
const $ = load(node.html);
|
|
148
|
-
const title = $('title').text().trim().toLowerCase();
|
|
149
|
-
const h1 = $('h1').first().text().trim().toLowerCase();
|
|
150
|
-
if (title) {
|
|
151
|
-
titleCounts.set(title, (titleCounts.get(title) || 0) + 1);
|
|
152
|
-
}
|
|
153
|
-
if (h1) {
|
|
154
|
-
h1Counts.set(h1, (h1Counts.get(h1) || 0) + 1);
|
|
155
|
-
}
|
|
156
|
-
processedCount++;
|
|
157
|
-
}
|
|
158
|
-
catch {
|
|
159
|
-
// Ignore parsing errors
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
// If we couldn't parse enough content (e.g., no HTML stored), fallback to size-based heuristic
|
|
163
|
-
if (processedCount < nodes.length * 0.5) {
|
|
164
|
-
if (nodes.length > 5)
|
|
165
|
-
return 'high';
|
|
166
|
-
if (nodes.length > 2)
|
|
167
|
-
return 'medium';
|
|
168
|
-
return 'low';
|
|
169
|
-
}
|
|
170
|
-
// Calculate duplicate ratios
|
|
171
|
-
let duplicateTitleCount = 0;
|
|
172
|
-
let duplicateH1Count = 0;
|
|
173
|
-
for (const count of titleCounts.values()) {
|
|
174
|
-
if (count > 1)
|
|
175
|
-
duplicateTitleCount += count;
|
|
176
|
-
}
|
|
177
|
-
for (const count of h1Counts.values()) {
|
|
178
|
-
if (count > 1)
|
|
179
|
-
duplicateH1Count += count;
|
|
180
|
-
}
|
|
181
|
-
const titleDupeRatio = duplicateTitleCount / nodes.length;
|
|
182
|
-
const h1DupeRatio = duplicateH1Count / nodes.length;
|
|
183
|
-
// Heuristic 1: High Risk
|
|
184
|
-
// Significant overlap in Titles OR H1s (e.g., > 30% of cluster members are duplicates)
|
|
185
|
-
if (titleDupeRatio > 0.3 || h1DupeRatio > 0.3) {
|
|
186
|
-
return 'high';
|
|
187
|
-
}
|
|
188
|
-
// Heuristic 2: Medium Risk
|
|
189
|
-
// Any overlap, or very large clusters (potential template issues or thin content)
|
|
190
|
-
if (titleDupeRatio > 0 || h1DupeRatio > 0 || nodes.length > 10) {
|
|
191
|
-
return 'medium';
|
|
192
|
-
}
|
|
193
|
-
// Heuristic 3: Low Risk
|
|
194
|
-
// Unique content and manageable cluster size
|
|
195
|
-
return 'low';
|
|
196
|
-
}
|
|
197
|
-
/**
|
|
198
|
-
* Finds the common path prefix among a set of URLs.
|
|
199
|
-
*/
|
|
200
|
-
function findSharedPathPrefix(urls) {
|
|
201
|
-
if (urls.length < 2)
|
|
202
|
-
return undefined;
|
|
203
|
-
try {
|
|
204
|
-
const paths = urls.map(u => new URL(u).pathname.split('/').filter(Boolean));
|
|
205
|
-
const first = paths[0];
|
|
206
|
-
const common = [];
|
|
207
|
-
for (let i = 0; i < first.length; i++) {
|
|
208
|
-
const segment = first[i];
|
|
209
|
-
if (paths.every(p => p[i] === segment)) {
|
|
210
|
-
common.push(segment);
|
|
211
|
-
}
|
|
212
|
-
else {
|
|
213
|
-
break;
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
return common.length > 0 ? '/' + common.join('/') : undefined;
|
|
217
|
-
}
|
|
218
|
-
catch {
|
|
219
|
-
return undefined;
|
|
220
|
-
}
|
|
221
|
-
}
|
package/dist/graph/duplicate.d.ts
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import { Graph } from './graph.js';
|
|
2
|
-
export interface DuplicateOptions {
|
|
3
|
-
collapse?: boolean;
|
|
4
|
-
simhashThreshold?: number;
|
|
5
|
-
}
|
|
6
|
-
/**
|
|
7
|
-
* Detects exact and near duplicates, identifies canonical conflicts,
|
|
8
|
-
* and performs non-destructive collapse of edges.
|
|
9
|
-
*/
|
|
10
|
-
export declare function detectDuplicates(graph: Graph, options?: DuplicateOptions): void;
|