npm - @crawlith/core - Versions diffs - 0.1.0 → 0.1.2 - Mend

@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (238) hide show

package/LICENSE +201 -0
package/README.md +70 -0
package/dist/analysis/analysis_list.html +35 -0
package/dist/analysis/analysis_page.html +123 -0
package/dist/analysis/analyze.d.ts +40 -5
package/dist/analysis/analyze.js +395 -347
package/dist/analysis/clustering.d.ts +23 -0
package/dist/analysis/clustering.js +206 -0
package/dist/analysis/content.d.ts +1 -1
package/dist/analysis/content.js +11 -5
package/dist/analysis/duplicate.d.ts +34 -0
package/dist/analysis/duplicate.js +305 -0
package/dist/analysis/heading.d.ts +116 -0
package/dist/analysis/heading.js +356 -0
package/dist/analysis/images.d.ts +1 -1
package/dist/analysis/images.js +6 -5
package/dist/analysis/links.d.ts +1 -1
package/dist/analysis/links.js +8 -8
package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
package/dist/analysis/scoring.js +11 -2
package/dist/analysis/seo.d.ts +8 -4
package/dist/analysis/seo.js +41 -30
package/dist/analysis/soft404.d.ts +17 -0
package/dist/analysis/soft404.js +62 -0
package/dist/analysis/structuredData.d.ts +1 -1
package/dist/analysis/structuredData.js +5 -4
package/dist/analysis/templates.d.ts +2 -0
package/dist/analysis/templates.js +7 -0
package/dist/application/index.d.ts +2 -0
package/dist/application/index.js +2 -0
package/dist/application/usecase.d.ts +3 -0
package/dist/application/usecase.js +1 -0
package/dist/application/usecases.d.ts +114 -0
package/dist/application/usecases.js +201 -0
package/dist/audit/index.js +1 -1
package/dist/audit/transport.d.ts +1 -1
package/dist/audit/transport.js +5 -4
package/dist/audit/types.d.ts +1 -0
package/dist/constants.d.ts +17 -0
package/dist/constants.js +23 -0
package/dist/core/scope/scopeManager.js +3 -0
package/dist/core/security/ipGuard.d.ts +11 -0
package/dist/core/security/ipGuard.js +71 -3
package/dist/crawler/crawl.d.ts +4 -22
package/dist/crawler/crawl.js +4 -335
package/dist/crawler/crawler.d.ts +87 -0
package/dist/crawler/crawler.js +683 -0
package/dist/crawler/extract.d.ts +4 -1
package/dist/crawler/extract.js +7 -2
package/dist/crawler/fetcher.d.ts +2 -1
package/dist/crawler/fetcher.js +26 -11
package/dist/crawler/metricsRunner.d.ts +23 -1
package/dist/crawler/metricsRunner.js +202 -72
package/dist/crawler/normalize.d.ts +41 -0
package/dist/crawler/normalize.js +119 -3
package/dist/crawler/parser.d.ts +1 -3
package/dist/crawler/parser.js +2 -49
package/dist/crawler/resolver.d.ts +11 -0
package/dist/crawler/resolver.js +67 -0
package/dist/crawler/sitemap.d.ts +6 -0
package/dist/crawler/sitemap.js +27 -17
package/dist/crawler/trap.d.ts +5 -1
package/dist/crawler/trap.js +23 -2
package/dist/db/CrawlithDB.d.ts +110 -0
package/dist/db/CrawlithDB.js +500 -0
package/dist/db/graphLoader.js +42 -30
package/dist/db/index.d.ts +11 -0
package/dist/db/index.js +41 -29
package/dist/db/migrations.d.ts +2 -0
package/dist/db/{schema.js → migrations.js} +90 -43
package/dist/db/pluginRegistry.d.ts +9 -0
package/dist/db/pluginRegistry.js +19 -0
package/dist/db/repositories/EdgeRepository.d.ts +13 -0
package/dist/db/repositories/EdgeRepository.js +20 -0
package/dist/db/repositories/MetricsRepository.d.ts +16 -8
package/dist/db/repositories/MetricsRepository.js +28 -7
package/dist/db/repositories/PageRepository.d.ts +15 -2
package/dist/db/repositories/PageRepository.js +169 -25
package/dist/db/repositories/SiteRepository.d.ts +9 -0
package/dist/db/repositories/SiteRepository.js +13 -0
package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
package/dist/db/repositories/SnapshotRepository.js +64 -5
package/dist/db/reset.d.ts +9 -0
package/dist/db/reset.js +32 -0
package/dist/db/statements.d.ts +12 -0
package/dist/db/statements.js +40 -0
package/dist/diff/compare.d.ts +0 -5
package/dist/diff/compare.js +0 -12
package/dist/diff/service.d.ts +16 -0
package/dist/diff/service.js +41 -0
package/dist/domain/index.d.ts +4 -0
package/dist/domain/index.js +4 -0
package/dist/events.d.ts +56 -0
package/dist/events.js +1 -0
package/dist/graph/graph.d.ts +36 -42
package/dist/graph/graph.js +26 -17
package/dist/graph/hits.d.ts +23 -0
package/dist/graph/hits.js +111 -0
package/dist/graph/metrics.d.ts +0 -4
package/dist/graph/metrics.js +25 -9
package/dist/graph/pagerank.d.ts +17 -4
package/dist/graph/pagerank.js +126 -91
package/dist/graph/simhash.d.ts +6 -0
package/dist/graph/simhash.js +14 -0
package/dist/index.d.ts +29 -8
package/dist/index.js +29 -8
package/dist/lock/hashKey.js +1 -1
package/dist/lock/lockManager.d.ts +5 -1
package/dist/lock/lockManager.js +38 -13
package/dist/plugin-system/plugin-cli.d.ts +10 -0
package/dist/plugin-system/plugin-cli.js +31 -0
package/dist/plugin-system/plugin-config.d.ts +16 -0
package/dist/plugin-system/plugin-config.js +36 -0
package/dist/plugin-system/plugin-loader.d.ts +17 -0
package/dist/plugin-system/plugin-loader.js +122 -0
package/dist/plugin-system/plugin-registry.d.ts +25 -0
package/dist/plugin-system/plugin-registry.js +167 -0
package/dist/plugin-system/plugin-types.d.ts +205 -0
package/dist/plugin-system/plugin-types.js +1 -0
package/dist/ports/index.d.ts +9 -0
package/dist/ports/index.js +1 -0
package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
package/dist/report/crawlExport.d.ts +3 -0
package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
package/dist/report/crawl_template.d.ts +1 -0
package/dist/report/crawl_template.js +7 -0
package/dist/report/export.d.ts +3 -0
package/dist/report/export.js +81 -0
package/dist/report/html.js +15 -216
package/dist/report/insight.d.ts +27 -0
package/dist/report/insight.js +103 -0
package/dist/scoring/health.d.ts +56 -0
package/dist/scoring/health.js +213 -0
package/dist/utils/chalk.d.ts +6 -0
package/dist/utils/chalk.js +41 -0
package/dist/utils/secureConfig.d.ts +23 -0
package/dist/utils/secureConfig.js +128 -0
package/package.json +12 -6
package/CHANGELOG.md +0 -7
package/dist/db/schema.d.ts +0 -2
package/dist/graph/cluster.d.ts +0 -6
package/dist/graph/cluster.js +0 -173
package/dist/graph/duplicate.d.ts +0 -10
package/dist/graph/duplicate.js +0 -251
package/dist/report/sitegraphExport.d.ts +0 -3
package/dist/report/sitegraph_template.d.ts +0 -1
package/dist/report/sitegraph_template.js +0 -630
package/dist/scoring/hits.d.ts +0 -9
package/dist/scoring/hits.js +0 -111
package/src/analysis/analyze.ts +0 -548
package/src/analysis/content.ts +0 -62
package/src/analysis/images.ts +0 -28
package/src/analysis/links.ts +0 -41
package/src/analysis/scoring.ts +0 -59
package/src/analysis/seo.ts +0 -82
package/src/analysis/structuredData.ts +0 -62
package/src/audit/dns.ts +0 -49
package/src/audit/headers.ts +0 -98
package/src/audit/index.ts +0 -66
package/src/audit/scoring.ts +0 -232
package/src/audit/transport.ts +0 -258
package/src/audit/types.ts +0 -102
package/src/core/network/proxyAdapter.ts +0 -21
package/src/core/network/rateLimiter.ts +0 -39
package/src/core/network/redirectController.ts +0 -47
package/src/core/network/responseLimiter.ts +0 -34
package/src/core/network/retryPolicy.ts +0 -57
package/src/core/scope/domainFilter.ts +0 -45
package/src/core/scope/scopeManager.ts +0 -52
package/src/core/scope/subdomainPolicy.ts +0 -39
package/src/core/security/ipGuard.ts +0 -92
package/src/crawler/crawl.ts +0 -382
package/src/crawler/extract.ts +0 -34
package/src/crawler/fetcher.ts +0 -233
package/src/crawler/metricsRunner.ts +0 -124
package/src/crawler/normalize.ts +0 -108
package/src/crawler/parser.ts +0 -190
package/src/crawler/sitemap.ts +0 -73
package/src/crawler/trap.ts +0 -96
package/src/db/graphLoader.ts +0 -105
package/src/db/index.ts +0 -70
package/src/db/repositories/EdgeRepository.ts +0 -29
package/src/db/repositories/MetricsRepository.ts +0 -49
package/src/db/repositories/PageRepository.ts +0 -128
package/src/db/repositories/SiteRepository.ts +0 -32
package/src/db/repositories/SnapshotRepository.ts +0 -74
package/src/db/schema.ts +0 -177
package/src/diff/compare.ts +0 -84
package/src/graph/cluster.ts +0 -192
package/src/graph/duplicate.ts +0 -286
package/src/graph/graph.ts +0 -172
package/src/graph/metrics.ts +0 -110
package/src/graph/pagerank.ts +0 -125
package/src/graph/simhash.ts +0 -61
package/src/index.ts +0 -30
package/src/lock/hashKey.ts +0 -51
package/src/lock/lockManager.ts +0 -124
package/src/lock/pidCheck.ts +0 -13
package/src/report/html.ts +0 -227
package/src/report/sitegraphExport.ts +0 -58
package/src/scoring/hits.ts +0 -131
package/src/scoring/orphanSeverity.ts +0 -176
package/src/utils/version.ts +0 -18
package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
package/tests/analysis.unit.test.ts +0 -98
package/tests/analyze.integration.test.ts +0 -98
package/tests/audit/dns.test.ts +0 -31
package/tests/audit/headers.test.ts +0 -45
package/tests/audit/scoring.test.ts +0 -133
package/tests/audit/security.test.ts +0 -12
package/tests/audit/transport.test.ts +0 -112
package/tests/clustering.test.ts +0 -118
package/tests/crawler.test.ts +0 -358
package/tests/db.test.ts +0 -159
package/tests/diff.test.ts +0 -67
package/tests/duplicate.test.ts +0 -110
package/tests/fetcher.test.ts +0 -106
package/tests/fetcher_safety.test.ts +0 -85
package/tests/fixtures/analyze-crawl.json +0 -26
package/tests/hits.test.ts +0 -134
package/tests/html_report.test.ts +0 -58
package/tests/lock/lockManager.test.ts +0 -138
package/tests/metrics.test.ts +0 -196
package/tests/normalize.test.ts +0 -101
package/tests/orphanSeverity.test.ts +0 -160
package/tests/pagerank.test.ts +0 -98
package/tests/parser.test.ts +0 -117
package/tests/proxy_safety.test.ts +0 -57
package/tests/redirect_safety.test.ts +0 -73
package/tests/safety.test.ts +0 -114
package/tests/scope.test.ts +0 -66
package/tests/scoring.test.ts +0 -59
package/tests/sitemap.test.ts +0 -88
package/tests/soft404.test.ts +0 -41
package/tests/trap.test.ts +0 -39
package/tests/visualization_data.test.ts +0 -46
package/tsconfig.json +0 -11

package/dist/utils/chalk.js ADDED Viewed

@@ -0,0 +1,41 @@
+import { styleText } from 'node:util';
+const alias = { // Maps alternate spellings to the style name that is handed on to styleText.
+    grey: 'gray'
+};
+const chalk = createChalk([]); // Root styler with no styles accumulated; exported as the module default.
+function createChalk(styles) { // Returns a callable Proxy that renders text with the given styles; reading a string property yields a new styler with that name appended.
+    const formatter = ((text) => applyStyles(styles, text));
+    return new Proxy(formatter, {
+        apply(_target, _thisArg, args) { // Only the first call argument is rendered; any further arguments are ignored.
+            return applyStyles(styles, args[0]);
+        },
+        get(_target, prop) {
+            if (typeof prop !== 'string') { // Non-string (symbol) keys get undefined instead of a new styler.
+                return undefined;
+            }
+            const style = alias[prop] ?? prop; // Names without an alias entry are used as-is. NOTE(review): alias is a plain object literal, so inherited keys (e.g. toString) resolve to non-string values here.
+            return createChalk([...styles, style]); // NOTE(review): every string property is accepted, including names that are not styles; confirm styleText rejects them when the styler is called.
+        }
+    });
+}
+function applyStyles(styles, text) { // Converts text to a string and passes it through styleText only when styles exist and color is enabled.
+    const value = String(text ?? ''); // null and undefined become the empty string.
+    if (styles.length === 0 || !isColorEnabled()) { // Plain text when there is nothing to apply or color is off.
+        return value;
+    }
+    return styleText(styles, value);
+}
+function isColorEnabled() { // Decides from environment variables and stdout whether styling should be applied.
+    if (process.env.NO_COLOR !== undefined || process.env.NODE_DISABLE_COLORS !== undefined) { // Presence alone disables color, whatever the value (an empty string included).
+        return false;
+    }
+    const forceColor = process.env.FORCE_COLOR;
+    if (forceColor === '0') { // FORCE_COLOR=0 turns color off.
+        return false;
+    }
+    if (forceColor !== undefined) { // Any other FORCE_COLOR value turns color on without consulting stdout.
+        return true;
+    }
+    return Boolean(process.stdout?.isTTY); // Otherwise color is used only when stdout reports isTTY.
+}
+export default chalk;

package/dist/utils/secureConfig.d.ts ADDED Viewed

@@ -0,0 +1,23 @@
+export interface CrawlithConfig { // Shape of the parsed config file: one object per named section.
+    [section: string]: {
+        key?: string; // Encrypted key payload as stored by setEncryptedConfigKey, not the plaintext API key.
+        createdAt?: number; // Unix time in whole seconds at which the key was stored.
+        [key: string]: unknown; // Any further per-section fields.
+    };
+}
+/**
+ * Resolve the canonical Crawlith config file path.
+ *
+ * @returns The path of config.json inside the .crawlith directory under the user home directory.
+ */
+export declare function getCrawlithConfigPath(): string;
+/**
+ * Return section config, or undefined if config file/section does not exist.
+ *
+ * @param section Top-level key of the config object to look up.
+ * @returns The section exactly as stored on disk; the key field is not decrypted.
+ * @throws Error if the config file exists but cannot be read or parsed.
+ */
+export declare function getConfigSection(section: string): CrawlithConfig[string] | undefined;
+/**
+ * Encrypt and persist a section API key in ~/.crawlith/config.json.
+ *
+ * Existing fields of the section are kept; key and createdAt are overwritten.
+ *
+ * @param section Top-level config key that receives the encrypted key.
+ * @param apiKey Plaintext key; only its encrypted form is written to disk.
+ * @throws Error if an existing config file cannot be read or parsed.
+ */
+export declare function setEncryptedConfigKey(section: string, apiKey: string): void;
+/**
+ * Get and decrypt the API key for a config section.
+ *
+ * @param section Top-level config key whose key field should be decrypted.
+ * @returns The decrypted API key.
+ * @throws Error if the config file is missing or corrupt, the section has no string key,
+ *   or the stored payload cannot be decrypted.
+ */
+export declare function getDecryptedConfigKey(section: string): string;

package/dist/utils/secureConfig.js ADDED Viewed

@@ -0,0 +1,128 @@
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import crypto from 'node:crypto';
+const CONFIG_DIR = path.join(os.homedir(), '.crawlith'); // Per-user config directory, resolved once when the module loads.
+const CONFIG_PATH = path.join(CONFIG_DIR, 'config.json'); // Single JSON file that holds every config section.
+/**
+ * Resolve the canonical Crawlith config file path.
+ *
+ * @returns {string} CONFIG_PATH, i.e. config.json inside the .crawlith directory under os.homedir().
+ */
+export function getCrawlithConfigPath() {
+    return CONFIG_PATH;
+}
+/**
+ * Return section config, or undefined if config file/section does not exist.
+ *
+ * @param {string} section - Top-level key of the config object to look up.
+ * @returns {object|undefined} The stored section value as parsed from disk; nothing is decrypted here.
+ * @throws {Error} If the config file exists but cannot be read or parsed (raised by readConfigFile).
+ */
+export function getConfigSection(section) {
+    const config = readConfigFile(false); // false: a missing file yields null instead of throwing.
+    if (!config)
+        return undefined;
+    return config[section];
+}
+/**
+ * Encrypt and persist a section API key in ~/.crawlith/config.json.
+ *
+ * Other fields already stored in the section are kept; key and createdAt are overwritten.
+ *
+ * @param {string} section - Top-level config key that receives the encrypted key.
+ * @param {string} apiKey - Plaintext key; only its encrypted form is written.
+ * @throws {Error} If an existing config file cannot be read or parsed (raised by readConfigFile).
+ */
+export function setEncryptedConfigKey(section, apiKey) {
+    const config = readConfigFile(false) || {}; // Start from an empty config when no file exists yet.
+    config[section] = {
+        ...(config[section] || {}),
+        key: encryptString(apiKey),
+        createdAt: Math.floor(Date.now() / 1000) // Unix time in whole seconds.
+    };
+    writeConfigFile(config);
+}
+/**
+ * Get and decrypt the API key for a config section.
+ *
+ * @param {string} section - Top-level config key whose key field should be decrypted.
+ * @returns {string} The decrypted API key.
+ * @throws {Error} If the config file is missing or corrupt, the section has no non-empty
+ *   string key, or decryption fails.
+ */
+export function getDecryptedConfigKey(section) {
+    if (!fs.existsSync(CONFIG_PATH)) { // NOTE(review): readConfigFile(true) repeats this existence check; this one produces the section-specific message.
+        throw new Error(`Missing ${section} config. Run: crawlith config ${section} set <api_key>`);
+    }
+    const config = readConfigFile(true);
+    if (!config) { // Reached when the file parses to a falsy JSON value such as null.
+        throw new Error(`Missing ${section} config. Run: crawlith config ${section} set <api_key>`);
+    }
+    const payload = config[section]?.key;
+    if (!payload || typeof payload !== 'string') { // Rejects a missing section, a missing or empty key, and non-string keys.
+        throw new Error(`Missing ${section} key in config. Run: crawlith config ${section} set <api_key>`);
+    }
+    return decryptString(payload);
+}
+/**
+ * Read config file from disk.
+ *
+ * @param {boolean} required - When truthy, a missing file throws; otherwise it yields null.
+ * @returns {*} The parsed JSON content, or null when the file is absent and not required.
+ * @throws {Error} If the file is required but missing, or if reading or parsing it fails.
+ */
+function readConfigFile(required) {
+    if (!fs.existsSync(CONFIG_PATH)) {
+        if (required) {
+            throw new Error('Missing config file. Run: crawlith config <service> set <api_key>');
+        }
+        return null;
+    }
+    try {
+        return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
+    }
+    catch { // NOTE(review): read failures (not only JSON syntax errors) land here, and the original error is discarded.
+        throw new Error('Corrupt config file at ~/.crawlith/config.json. Refusing to continue.');
+    }
+}
+/**
+ * Persist config to disk with secure permissions.
+ *
+ * Creates the config directory with mode 0o700 when it is absent, writes the config as
+ * 2-space-indented JSON with mode 0o600, then applies chmod 0o600 to the file.
+ *
+ * @param {object} config - Value to serialize with JSON.stringify.
+ */
+function writeConfigFile(config) {
+    if (!fs.existsSync(CONFIG_DIR)) { // The mode of a directory that already exists is left as it is.
+        fs.mkdirSync(CONFIG_DIR, { recursive: true, mode: 0o700 });
+    }
+    fs.writeFileSync(CONFIG_PATH, JSON.stringify(config, null, 2), { encoding: 'utf8', mode: 0o600 });
+    fs.chmodSync(CONFIG_PATH, 0o600); // Presumably covers a pre-existing file whose mode the write would not change; TODO confirm.
+}
+/**
+ * Build a machine-bound secret so encrypted config blobs are not portable across systems.
+ *
+ * NOTE(review): the value consists only of the hostname and the current username, which are
+ * presumably not confidential; confirm this matches the intended threat model.
+ *
+ * @returns {string} The hostname and the username joined by a double colon.
+ */
+function getMachineSecret() {
+    return `${os.hostname()}::${os.userInfo().username}`;
+}
+/**
+ * Encrypt plaintext using AES-256-GCM and scrypt-derived key.
+ *
+ * A fresh 16-byte salt and 12-byte IV are generated on every call, and the 32-byte key is
+ * derived from getMachineSecret() with that salt.
+ *
+ * @param {string} plaintext - Text to encrypt; it is fed to the cipher as UTF-8.
+ * @returns {string} Base64 of a JSON object whose salt, iv, tag and data fields are each base64.
+ */
+function encryptString(plaintext) {
+    const salt = crypto.randomBytes(16);
+    const iv = crypto.randomBytes(12);
+    const key = crypto.scryptSync(getMachineSecret(), salt, 32);
+    const cipher = crypto.createCipheriv('aes-256-gcm', key, iv);
+    const encrypted = Buffer.concat([cipher.update(plaintext, 'utf8'), cipher.final()]);
+    const payload = {
+        salt: salt.toString('base64'),
+        iv: iv.toString('base64'),
+        tag: cipher.getAuthTag().toString('base64'), // Read after cipher.final() has run above.
+        data: encrypted.toString('base64')
+    };
+    return Buffer.from(JSON.stringify(payload), 'utf8').toString('base64');
+}
+/**
+ * Decrypt an encrypted base64 payload from config.json.
+ *
+ * @param {string} encodedPayload - Base64 of the JSON object produced by encryptString.
+ * @returns {string} The decrypted text decoded as UTF-8.
+ * @throws {Error} If the payload is not parseable JSON, lacks salt, iv, tag or data,
+ *   or cannot be decrypted.
+ */
+function decryptString(encodedPayload) {
+    let payload;
+    try {
+        payload = JSON.parse(Buffer.from(encodedPayload, 'base64').toString('utf8'));
+    }
+    catch {
+        throw new Error('Corrupt config payload: unable to parse encrypted key data.');
+    }
+    if (!payload?.salt || !payload?.iv || !payload?.tag || !payload?.data) { // All four fields must be present and truthy.
+        throw new Error('Corrupt config payload: required encryption fields are missing.');
+    }
+    try {
+        const salt = Buffer.from(payload.salt, 'base64');
+        const iv = Buffer.from(payload.iv, 'base64');
+        const tag = Buffer.from(payload.tag, 'base64');
+        const data = Buffer.from(payload.data, 'base64');
+        const key = crypto.scryptSync(getMachineSecret(), salt, 32); // Same derivation as encryptString, using the stored salt.
+        const decipher = crypto.createDecipheriv('aes-256-gcm', key, iv);
+        decipher.setAuthTag(tag); // Set before decipher.final() is called below.
+        return Buffer.concat([decipher.update(data), decipher.final()]).toString('utf8');
+    }
+    catch { // Every failure in the block above is reported with this single message.
+        throw new Error('Unable to decrypt config key. Config may be invalid or tied to another machine/user.');
+    }
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,8 @@
 {
   "name": "@crawlith/core",
-  "version": "0.1.0",
+  "license": "Apache-2.0",
+  "version": "0.1.2",
+  "description": "Headless intelligence engine for Crawlith. Handles crawling, graph analysis, scoring, and data persistence.",
   "type": "module",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -11,23 +13,27 @@
       "default": "./dist/index.js"
     }
   },
+  "files": [
+    "dist"
+  ],
   "dependencies": {
     "better-sqlite3": "^12.6.2",
-    "chalk": "^5.3.0",
     "cheerio": "^1.0.0-rc.12",
-    "p-limit": "^5.0.0",
+    "commander": "^12.1.0",
+    "p-limit": "^7.3.0",
     "robots-parser": "^3.0.1",
-    "undici": "^6.13.0",
-    "vite": "7.3.1"
+    "undici": "^6.13.0"
   },
   "devDependencies": {
     "@types/better-sqlite3": "^7.6.13",
+    "@types/cheerio": "0.22.35",
     "@types/node": "^20.12.7",
     "typescript": "^5.4.5",
+    "vite": "7.3.1",
     "vitest": "^4.0.18"
   },
   "scripts": {
-    "build": "tsc",
+    "build": "tsc && node scripts/copy-assets.js",
     "test": "vitest run"
   }
 }

package/CHANGELOG.md DELETED Viewed

@@ -1,7 +0,0 @@
-# @crawlith/core
-## 0.1.0
-### Minor Changes
-- First test

package/dist/db/schema.d.ts DELETED Viewed

@@ -1,2 +0,0 @@
-import { Database } from 'better-sqlite3';
-export declare function initSchema(db: Database): void;

package/dist/graph/cluster.d.ts DELETED Viewed

@@ -1,6 +0,0 @@
-import { Graph, ClusterInfo } from './graph.js';
-/**
- * Detects content clusters using 64-bit SimHash and Hamming Distance.
- * Uses band optimization to reduce O(n^2) comparisons.
- *
- * @param graph Graph whose nodes are examined.
- * @param threshold Largest Hamming distance at which two pages are linked; the implementation defaults it to 10.
- * @param minSize Smallest number of members a cluster needs to be reported; the implementation defaults it to 3.
- * @returns One ClusterInfo per detected cluster.
- */
-export declare function detectContentClusters(graph: Graph, threshold?: number, minSize?: number): ClusterInfo[];

package/dist/graph/cluster.js DELETED Viewed

@@ -1,173 +0,0 @@
-import { SimHash } from './simhash.js';
-/**
- * Detects content clusters using 64-bit SimHash and Hamming Distance.
- * Uses band optimization to reduce O(n^2) comparisons.
- *
- * Only nodes with a truthy simhash and status 200 take part. Nodes that share the value of at
- * least one 16-bit band are compared pairwise; pairs within the threshold are linked, and the
- * connected components of those links with at least minSize members become clusters.
- *
- * Side effects: sets clusterId on every member node and assigns the result to
- * graph.contentClusters.
- *
- * @param graph - Object providing getNodes() and a nodes Map keyed by URL.
- * @param threshold - Largest Hamming distance at which two nodes are linked (default 10).
- * @param minSize - Smallest component size that is kept as a cluster (default 3).
- * @returns Array of { id, count, primaryUrl, risk, sharedPathPrefix }, with ids starting at 1.
- */
-export function detectContentClusters(graph, threshold = 10, minSize = 3) {
-    const nodes = graph.getNodes().filter(n => n.simhash && n.status === 200);
-    if (nodes.length === 0)
-        return []; // NOTE(review): this early return leaves graph.contentClusters untouched.
-    const adjacency = new Map();
-    // Banding Optimization (4 bands of 16 bits)
-    // Note: For threshold > 3, this is a heuristic and may miss some pairs,
-    // but it dramatically reduces the search space as requested.
-    const bands = 4;
-    const bandWidth = 16;
-    const buckets = Array.from({ length: bands }, () => new Map());
-    for (const node of nodes) {
-        const hash = BigInt(node.simhash);
-        for (let b = 0; b < bands; b++) {
-            const bandValue = Number((hash >> BigInt(b * bandWidth)) & 0xffffn); // The 16 bits of the hash starting at bit b * 16.
-            if (!buckets[b].has(bandValue)) {
-                buckets[b].set(bandValue, new Set());
-            }
-            buckets[b].get(bandValue).add(node.url);
-        }
-    }
-    const checkedPairs = new Set();
-    for (let b = 0; b < bands; b++) {
-        for (const bucket of buckets[b].values()) {
-            if (bucket.size < 2)
-                continue;
-            const bucketNodes = Array.from(bucket);
-            for (let i = 0; i < bucketNodes.length; i++) {
-                for (let j = i + 1; j < bucketNodes.length; j++) {
-                    const u1 = bucketNodes[i];
-                    const u2 = bucketNodes[j];
-                    if (u1 === u2)
-                        continue;
-                    const pairKey = u1 < u2 ? `${u1}|${u2}` : `${u2}|${u1}`; // Order-independent key so a pair is measured once even if it shares several bands.
-                    if (checkedPairs.has(pairKey))
-                        continue;
-                    checkedPairs.add(pairKey);
-                    const n1 = graph.nodes.get(u1);
-                    const n2 = graph.nodes.get(u2);
-                    const dist = SimHash.hammingDistance(BigInt(n1.simhash), BigInt(n2.simhash));
-                    if (dist <= threshold) {
-                        if (!adjacency.has(u1))
-                            adjacency.set(u1, new Set());
-                        if (!adjacency.has(u2))
-                            adjacency.set(u2, new Set());
-                        adjacency.get(u1).add(u2);
-                        adjacency.get(u2).add(u1);
-                    }
-                }
-            }
-        }
-    }
-    // Find connected components (Clusters)
-    const visited = new Set();
-    const clusters = [];
-    for (const node of nodes) {
-        if (visited.has(node.url))
-            continue;
-        const component = [];
-        const queue = [node.url];
-        visited.add(node.url);
-        while (queue.length > 0) {
-            const current = queue.shift();
-            component.push(current);
-            const neighbors = adjacency.get(current);
-            if (neighbors) {
-                for (const neighbor of neighbors) {
-                    if (!visited.has(neighbor)) {
-                        visited.add(neighbor);
-                        queue.push(neighbor);
-                    }
-                }
-            }
-        }
-        if (component.length >= minSize) { // Smaller components, unlinked single nodes included, are dropped.
-            clusters.push(component);
-        }
-    }
-    // Sort clusters by size (descending) then by primary URL (ascending) for deterministic IDs
-    clusters.sort((a, b) => {
-        if (b.length !== a.length)
-            return b.length - a.length;
-        const aPrimary = selectPrimaryUrl(a, graph);
-        const bPrimary = selectPrimaryUrl(b, graph);
-        return aPrimary.localeCompare(bPrimary);
-    });
-    const clusterInfos = [];
-    clusters.forEach((memberUrls, index) => {
-        const clusterId = index + 1; // Ids follow the sorted order and start at 1.
-        const clusterNodes = memberUrls.map(url => graph.nodes.get(url));
-        for (const node of clusterNodes) {
-            node.clusterId = clusterId;
-        }
-        const primaryUrl = selectPrimaryUrl(memberUrls, graph);
-        const risk = calculateClusterRisk(clusterNodes);
-        const sharedPathPrefix = findSharedPathPrefix(memberUrls);
-        clusterInfos.push({
-            id: clusterId,
-            count: memberUrls.length,
-            primaryUrl,
-            risk,
-            sharedPathPrefix
-        });
-    });
-    graph.contentClusters = clusterInfos;
-    return clusterInfos;
-}
-/**
- * Selects the primary URL for a cluster based on:
- * 1. Highest PageRank
- * 2. Shortest URL
- * 3. Lexicographic fallback
- *
- * A missing or falsy pageRank counts as 0. The reduce call has no initial value, so urls
- * must contain at least one entry.
- *
- * @param urls - Member URLs; each one is looked up in graph.nodes.
- * @param graph - Object whose nodes Map is keyed by URL.
- * @returns The URL that wins the comparison above.
- */
-function selectPrimaryUrl(urls, graph) {
-    return urls.reduce((best, current) => {
-        const nBest = graph.nodes.get(best);
-        const nCurrent = graph.nodes.get(current);
-        if ((nCurrent.pageRank || 0) > (nBest.pageRank || 0))
-            return current;
-        if ((nCurrent.pageRank || 0) < (nBest.pageRank || 0))
-            return best;
-        if (current.length < best.length)
-            return current;
-        if (current.length > best.length)
-            return best;
-        return current.localeCompare(best) < 0 ? current : best; // Equal rank and equal length: the URL that sorts first wins.
-    });
-}
-/**
- * Calculates cannibalization risk based on title and H1 similarity within the cluster.
- *
- * NOTE(review): despite the sentence above, the code below never reads titles or H1s; the
- * result depends only on the number of nodes.
- *
- * @param nodes - Member nodes of one cluster; only the array length is used.
- * @returns The string high for more than 5 nodes, medium for more than 2, otherwise low.
- */
-function calculateClusterRisk(nodes) {
-    // Logic: Check if there's significant overlap in Titles or H1s among cluster members.
-    // This is a heuristic as requested.
-    // Simplified heuristic: risk is based on cluster density and size
-    // Large clusters of highly similar content are high risk.
-    // Fallback to a safe categorization
-    if (nodes.length > 5)
-        return 'high';
-    if (nodes.length > 2)
-        return 'medium';
-    return 'low';
-}
-/**
- * Finds the common path prefix among a set of URLs.
- *
- * Pathnames are split on slashes with empty segments removed, then compared segment by
- * segment from the start until the first mismatch.
- *
- * @param urls - URL strings to compare.
- * @returns A slash followed by the shared leading segments joined with slashes, or undefined
- *   when fewer than two URLs are given, no leading segment is shared, or an error is thrown
- *   while parsing.
- */
-function findSharedPathPrefix(urls) {
-    if (urls.length < 2)
-        return undefined;
-    try {
-        const paths = urls.map(u => new URL(u).pathname.split('/').filter(Boolean));
-        const first = paths[0];
-        const common = [];
-        for (let i = 0; i < first.length; i++) {
-            const segment = first[i];
-            if (paths.every(p => p[i] === segment)) {
-                common.push(segment);
-            }
-            else {
-                break;
-            }
-        }
-        return common.length > 0 ? '/' + common.join('/') : undefined;
-    }
-    catch { // An error thrown above (for example by new URL) yields undefined rather than propagating.
-        return undefined;
-    }
-}

package/dist/graph/duplicate.d.ts DELETED Viewed

@@ -1,10 +0,0 @@
-import { Graph } from './graph.js';
-export interface DuplicateOptions {
-    collapse?: boolean; // Treated as true unless it is explicitly set to false.
-    simhashThreshold?: number; // Falls back to 3 when omitted; presumably the largest SimHash distance counted as a near duplicate (confirm against the implementation).
-}
-/**
- * Detects exact and near duplicates, identifies canonical conflicts,
- * and performs non-destructive collapse of edges.
- *
- * Nothing is returned. Exact-duplicate detection only considers nodes that have a contentHash
- * and status 200, and tags the members of each group in place through duplicateClusterId and
- * duplicateType.
- *
- * @param graph Graph whose nodes are examined and annotated.
- * @param options Optional settings; an omitted object behaves like an empty one.
- */
-export declare function detectDuplicates(graph: Graph, options?: DuplicateOptions): void;

package/dist/graph/duplicate.js DELETED Viewed

@@ -1,251 +0,0 @@
-import { SimHash } from './simhash.js';
-/**
- * Detects exact and near duplicates, identifies canonical conflicts,
- * and performs non-destructive collapse of edges.
- */
-export function detectDuplicates(graph, options = {}) {
-    const collapse = options.collapse !== false; // Default to true
-    const threshold = options.simhashThreshold ?? 3;
-    const exactClusters = [];
-    const nearClusters = [];
-    const nodes = graph.getNodes();
-    // Phase 1 & 2: Exact Duplicate Detection
-    const exactMap = new Map();
-    for (const node of nodes) {
-        if (!node.contentHash || node.status !== 200)
-            continue;
-        // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
-        let arr = exactMap.get(node.contentHash);
-        if (!arr) {
-            arr = [];
-            exactMap.set(node.contentHash, arr);
-        }
-        arr.push(node);
-    }
-    // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
-    const nearCandidates = [];
-    let clusterCounter = 1;
-    for (const [_hash, group] of exactMap.entries()) {
-        if (group.length > 1) {
-            const id = `cluster_exact_${clusterCounter++}`;
-            exactClusters.push({ id, type: 'exact', nodes: group });
-            // Mark nodes
-            for (const n of group) {
-                n.duplicateClusterId = id;
-                n.duplicateType = 'exact';
-            }
-        }
-        else {
-            nearCandidates.push(group[0]);
-        }
-    }
-    // Phase 3: Near Duplicate Detection (SimHash with Bands)
-    // 64-bit simhash -> split into 4 bands of 16 bits.
-    const bandsMaps = [
-        new Map(),
-        new Map(),
-        new Map(),
-        new Map()
-    ];
-    for (const node of nearCandidates) {
-        if (!node.simhash)
-            continue;
-        const simhash = BigInt(node.simhash);
-        // Extract 16 bit bands
-        const b0 = Number(simhash & 0xffffn);
-        const b1 = Number((simhash >> 16n) & 0xffffn);
-        const b2 = Number((simhash >> 32n) & 0xffffn);
-        const b3 = Number((simhash >> 48n) & 0xffffn);
-        const bands = [b0, b1, b2, b3];
-        for (let i = 0; i < 4; i++) {
-            let arr = bandsMaps[i].get(bands[i]);
-            if (!arr) {
-                arr = [];
-                bandsMaps[i].set(bands[i], arr);
-            }
-            arr.push(node);
-        }
-    }
-    // Find candidate pairs
-    const nearGroupMap = new Map(); // node.url -> cluster set
-    const checkedPairs = new Set();
-    for (let i = 0; i < 4; i++) {
-        for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
-            if (bucketNodes.length < 2)
-                continue; // nothing to compare
-            // Compare all nodes in this bucket
-            for (let j = 0; j < bucketNodes.length; j++) {
-                for (let k = j + 1; k < bucketNodes.length; k++) {
-                    const n1 = bucketNodes[j];
-                    const n2 = bucketNodes[k];
-                    // Ensure n1 < n2 lexicographically to avoid duplicate pairs
-                    const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
-                    const pairKey = `${a.url}|${b.url}`;
-                    if (checkedPairs.has(pairKey))
-                        continue;
-                    checkedPairs.add(pairKey);
-                    const dist = SimHash.hammingDistance(BigInt(a.simhash), BigInt(b.simhash));
-                    if (dist <= threshold) {
-                        // They are near duplicates.
-                        // Find or create their cluster set using union-find or reference propagation
-                        const setA = nearGroupMap.get(a.url);
-                        const setB = nearGroupMap.get(b.url);
-                        if (!setA && !setB) {
-                            const newSet = new Set([a, b]);
-                            nearGroupMap.set(a.url, newSet);
-                            nearGroupMap.set(b.url, newSet);
-                        }
-                        else if (setA && !setB) {
-                            setA.add(b);
-                            nearGroupMap.set(b.url, setA);
-                        }
-                        else if (setB && !setA) {
-                            setB.add(a);
-                            nearGroupMap.set(a.url, setB);
-                        }
-                        else if (setA && setB && setA !== setB) {
-                            // Merge sets
-                            for (const node of setB) {
-                                setA.add(node);
-                                nearGroupMap.set(node.url, setA);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    // Compile near duplicate clusters (deduplicated by Set reference)
-    const uniqueNearSets = new Set();
-    for (const group of nearGroupMap.values()) {
-        uniqueNearSets.add(group);
-    }
-    for (const groupSet of uniqueNearSets) {
-        if (groupSet.size > 1) {
-            const id = `cluster_near_${clusterCounter++}`;
-            const groupArr = Array.from(groupSet);
-            nearClusters.push({ id, type: 'near', nodes: groupArr });
-            for (const n of groupArr) {
-                n.duplicateClusterId = id;
-                n.duplicateType = 'near';
-            }
-        }
-    }
-    const allClusters = [...exactClusters, ...nearClusters];
-    // Phase 4: Template-Heavy Detection
-    // Mark classes as 'template_heavy' if ratio < 0.3
-    for (const cluster of allClusters) {
-        const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
-        if (avgRatio < 0.3) {
-            cluster.type = 'template_heavy';
-            cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
-        }
-    }
-    // Phase 5: Canonical Conflict & Representative Selection
-    for (const cluster of allClusters) {
-        const canonicals = new Set();
-        let hasMissing = false;
-        for (const n of cluster.nodes) {
-            if (!n.canonical)
-                hasMissing = true;
-            // We compare full absolute canonical URLs (assuming they are normalized during crawl)
-            else
-                canonicals.add(n.canonical);
-        }
-        if (hasMissing || canonicals.size > 1) {
-            cluster.severity = 'high';
-        }
-        else if (cluster.type === 'near') {
-            cluster.severity = 'medium';
-        }
-        else {
-            cluster.severity = 'low';
-        }
-        // Phase 6: Select Representative
-        // 1. Valid Canonical target in cluster
-        // 2. Highest internal in-degree
-        // 3. Shortest URL
-        // 4. First discovered (relying on array order, which is from BFS map roughly)
-        let representativeNode = cluster.nodes[0];
-        // Evaluate best rep
-        const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
-        const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
-        if (validCanonicals.length > 0) {
-            representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
-        }
-        else {
-            representativeNode = cluster.nodes.reduce((best, current) => {
-                if (current.inLinks > best.inLinks)
-                    return current;
-                if (current.inLinks < best.inLinks)
-                    return best;
-                if (current.url.length < best.url.length)
-                    return current;
-                return best;
-            });
-        }
-        cluster.representative = representativeNode.url;
-        cluster.nodes.forEach(n => {
-            n.isClusterPrimary = n.url === representativeNode.url;
-            n.isCollapsed = false; // default for JSON
-            n.collapseInto = undefined;
-        });
-        // Push to Graph's final cluster list
-        graph.duplicateClusters.push({
-            id: cluster.id,
-            type: cluster.type,
-            size: cluster.nodes.length,
-            representative: representativeNode.url,
-            severity: cluster.severity
-        });
-        // Controlled Collapse
-        if (collapse) {
-            for (const n of cluster.nodes) {
-                if (n.url !== representativeNode.url) {
-                    n.isCollapsed = true;
-                    n.collapseInto = representativeNode.url;
-                }
-            }
-        }
-    }
-    // Final Edge Transfer if Collapsing
-    if (collapse) {
-        const edges = graph.getEdges();
-        const updatedEdges = new Map();
-        for (const edge of edges) {
-            const sourceNode = graph.nodes.get(edge.source);
-            const targetNode = graph.nodes.get(edge.target);
-            if (!sourceNode || !targetNode)
-                continue;
-            // We do NOT modify source structure for out-bound edges of collapsed nodes?
-            // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
-            // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
-            // If a collapsed node links to X, we ignore it (PageRank will filter it out).
-            const actualSource = edge.source;
-            // repoint target
-            const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
-            // Skip self-referential edges caused by repointing
-            if (actualSource === actualTarget)
-                continue;
-            const edgeKey = `${actualSource}|${actualTarget}`;
-            const existingWeight = updatedEdges.get(edgeKey) || 0;
-            updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
-        }
-        // Update graph edges in-place
-        graph.edges = updatedEdges;
-        // Re-calculate inLinks and outLinks based on collapsed edges
-        for (const node of graph.getNodes()) {
-            node.inLinks = 0;
-            node.outLinks = 0;
-        }
-        for (const [edgeKey, _weight] of updatedEdges.entries()) {
-            const [src, tgt] = edgeKey.split('|');
-            const sn = graph.nodes.get(src);
-            const tn = graph.nodes.get(tgt);
-            if (sn)
-                sn.outLinks++;
-            if (tn)
-                tn.inLinks++;
-        }
-    }
-}