@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
package/dist/db/index.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import Database from 'better-sqlite3';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import { initSchema } from './schema.js';
|
|
6
|
+
// Module-wide cached connection: assigned by getDb() on first use, reset to null by closeDb().
let dbInstance = null;
|
|
7
|
+
/**
 * Work out which SQLite location the process should use.
 *
 * Resolution order:
 *  1. `NODE_ENV === 'test'`  -> the in-memory database `':memory:'`.
 *  2. `CRAWLITH_DB_PATH` set -> that value, returned verbatim.
 *  3. otherwise              -> `<home>/.crawlith/crawlith.db`; the directory is
 *     created (and chmod-ed to 0700) when it does not exist yet.
 *
 * @returns the database path, or `':memory:'` under test.
 */
export function getDbPath() {
    const { NODE_ENV: nodeEnv, CRAWLITH_DB_PATH: overridePath } = process.env;
    if (nodeEnv === 'test') {
        return ':memory:';
    }
    if (overridePath) {
        return overridePath;
    }
    const dataDir = path.join(os.homedir(), '.crawlith');
    const dirMissing = !fs.existsSync(dataDir);
    if (dirMissing) {
        fs.mkdirSync(dataDir, { recursive: true });
        // Restrict a freshly created directory to the owning user (rwx------).
        fs.chmodSync(dataDir, 0o700);
    }
    return path.join(dataDir, 'crawlith.db');
}
|
|
23
|
+
/**
 * Return the shared database connection, opening and configuring it on first use.
 *
 * On the first call this opens the file reported by getDbPath(), applies the
 * pragmas below, tightens the file mode, runs an integrity check (a failure is
 * only logged) and initialises the schema. The handle is cached in `dbInstance`
 * and returned as-is by later calls.
 *
 * @returns the open better-sqlite3 connection.
 * @throws whatever the pragma setup or schema initialisation throws; the
 *         half-configured handle is closed first so it is not leaked.
 */
export function getDb() {
    if (dbInstance) {
        return dbInstance;
    }
    const dbPath = getDbPath();
    const db = new Database(dbPath);
    try {
        // Hardening & Performance Configuration
        db.pragma('journal_mode = WAL');
        db.pragma('synchronous = NORMAL');
        db.pragma('foreign_keys = ON');
        db.pragma('temp_store = MEMORY');
        db.pragma('mmap_size = 30000000000');
        db.pragma('cache_size = -20000');
        db.pragma('busy_timeout = 5000');
        // Security controls
        // Ensure file permissions are 600 (user read/write only).
        // An in-memory database has no file, so there is nothing to chmod.
        if (dbPath !== ':memory:') {
            try {
                fs.chmodSync(dbPath, 0o600);
            }
            catch (_e) {
                // Best effort: tightening permissions must not prevent the database from opening.
            }
        }
        // Integrity check on startup
        const integrity = db.pragma('integrity_check', { simple: true });
        if (integrity !== 'ok') {
            console.warn('Database integrity check failed:', integrity);
        }
        // Initialize schema
        initSchema(db);
    }
    catch (err) {
        // Setup failed before the handle was cached: close it so the next
        // getDb() call does not stack another open connection on top of it.
        try {
            db.close();
        }
        catch (_closeErr) {
            // Report the original failure rather than the close error.
        }
        throw err;
    }
    dbInstance = db;
    return db;
}
|
|
56
|
+
/**
 * Close the cached connection, if one is open, and forget it so that the next
 * getDb() call opens a fresh one. Calling it with no open connection is a no-op.
 */
export function closeDb() {
    if (!dbInstance) {
        return;
    }
    dbInstance.close();
    dbInstance = null;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
/** One row of the `edges` table: a link from one page to another within a snapshot. */
export interface Edge {
    /** Auto-increment primary key. */
    id: number;
    /** Owning snapshot (`snapshots.id`). */
    snapshot_id: number;
    /** Page the link originates from (`pages.id`). */
    source_page_id: number;
    /** Page the link points to (`pages.id`). */
    target_page_id: number;
    /** Link weight; the table defaults it to 1.0. */
    weight: number;
    /** Link relation; mirrors the CHECK constraint on the `rel` column. */
    rel:
        | 'nofollow'
        | 'sponsored'
        | 'ugc'
        | 'internal'
        | 'external'
        | 'unknown';
}
|
|
10
|
+
/** Data-access object for the `edges` table. */
export declare class EdgeRepository {
    private db;
    private insertStmt;
    /** @param db open better-sqlite3 connection the repository runs its statements on. */
    constructor(db: Database);
    /**
     * Insert one edge row.
     *
     * @param weight optional link weight.
     * @param rel optional link relation.
     */
    insertEdge(
        snapshotId: number,
        sourcePageId: number,
        targetPageId: number,
        weight?: number,
        rel?: string,
    ): void;
    /** Fetch every edge recorded for the given snapshot. */
    getEdgesBySnapshot(snapshotId: number): Edge[];
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Data-access object for the `edges` table.
 * The INSERT statement is prepared once in the constructor and reused.
 */
export class EdgeRepository {
    db;
    insertStmt;
    /** @param db open database connection exposing `prepare()`. */
    constructor(db) {
        const insertSql = `
      INSERT INTO edges (snapshot_id, source_page_id, target_page_id, weight, rel)
      VALUES (?, ?, ?, ?, ?)
    `;
        this.db = db;
        this.insertStmt = this.db.prepare(insertSql);
    }
    /**
     * Insert one edge row.
     * `weight` defaults to 1.0 and `rel` to 'internal' when omitted.
     */
    insertEdge(snapshotId, sourcePageId, targetPageId, weight = 1.0, rel = 'internal') {
        const values = [snapshotId, sourcePageId, targetPageId, weight, rel];
        this.insertStmt.run(...values);
    }
    /** Return all edge rows that belong to `snapshotId`. */
    getEdgesBySnapshot(snapshotId) {
        const selectBySnapshot = this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?');
        return selectBySnapshot.all(snapshotId);
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
/**
 * One row of the `metrics` table. Rows are keyed by (`snapshot_id`, `page_id`);
 * every score column is nullable.
 */
export interface DbMetrics {
    /** Owning snapshot (`snapshots.id`). */
    snapshot_id: number;
    /** Page the metrics describe (`pages.id`). */
    page_id: number;

    // Link-analysis scores.
    authority_score: number | null;
    hub_score: number | null;
    pagerank: number | null;
    pagerank_score: number | null;
    /** Mirrors the CHECK constraint on the `link_role` column. */
    link_role:
        | 'hub'
        | 'authority'
        | 'power'
        | 'balanced'
        | 'peripheral'
        | null;

    // Crawl and content signals.
    crawl_status: string | null;
    word_count: number | null;
    thin_content_score: number | null;
    external_link_ratio: number | null;
    orphan_score: number | null;

    // Duplicate-detection results.
    duplicate_cluster_id: string | null;
    /** Mirrors the CHECK constraint on the `duplicate_type` column. */
    duplicate_type:
        | 'exact'
        | 'near'
        | 'template_heavy'
        | 'none'
        | null;
    /** Stored as INTEGER with default 0 — presumably a 0/1 flag; confirm with callers. */
    is_cluster_primary: number;
}
|
|
19
|
+
/** Data-access object for the `metrics` table. */
export declare class MetricsRepository {
    private db;
    private insertStmt;
    /** @param db open better-sqlite3 connection the repository runs its statements on. */
    constructor(db: Database);
    /** Write one metrics row. */
    insertMetrics(metrics: DbMetrics): void;
    /** Fetch all metrics rows of a snapshot. */
    getMetrics(snapshotId: number): DbMetrics[];
    /** Fetch the metrics row of one page in a snapshot, or `undefined` when there is none. */
    getMetricsForPage(
        snapshotId: number,
        pageId: number,
    ): DbMetrics | undefined;
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Data-access object for the `metrics` table.
 * Writes go through a single prepared `INSERT OR REPLACE` with named parameters.
 */
export class MetricsRepository {
    db;
    insertStmt;
    /** @param db open database connection exposing `prepare()`. */
    constructor(db) {
        const writeSql = `
      INSERT OR REPLACE INTO metrics (
        snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
        link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
        orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
      ) VALUES (
        @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
        @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
        @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
      )
    `;
        this.db = db;
        this.insertStmt = this.db.prepare(writeSql);
    }
    /**
     * Insert a metrics row, replacing any existing row with the same key.
     * The object is passed straight through as the statement's named parameters.
     */
    insertMetrics(metrics) {
        this.insertStmt.run(metrics);
    }
    /** Return every metrics row recorded for `snapshotId`. */
    getMetrics(snapshotId) {
        const selectAll = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?');
        return selectAll.all(snapshotId);
    }
    /** Return the metrics row for one page of a snapshot (whatever `get()` yields when absent). */
    getMetricsForPage(snapshotId, pageId) {
        const selectOne = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
        return selectOne.get(snapshotId, pageId);
    }
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
/**
 * One row of the `pages` table. A page is unique per (`site_id`, `normalized_url`).
 * Columns typed `number` with a 0 default in the table are presumably 0/1 flags
 * or counters — confirm against the crawler before relying on that.
 */
export interface Page {
    /** Auto-increment primary key. */
    id: number;
    /** Owning site (`sites.id`). */
    site_id: number;
    normalized_url: string;
    /** Snapshot references (`snapshots.id`). */
    first_seen_snapshot_id: number | null;
    last_seen_snapshot_id: number | null;

    // Response data.
    http_status: number | null;
    canonical_url: string | null;
    content_hash: string | null;
    simhash: string | null;
    etag: string | null;
    last_modified: string | null;
    html: string | null;
    soft404_score: number | null;

    // Table default for each of the four non-null columns below is 0.
    noindex: number;
    nofollow: number;
    security_error: string | null;
    retries: number;
    depth: number;

    // Fetch details.
    redirect_chain: string | null;
    bytes_received: number | null;

    // Crawl-trap detection; `crawl_trap_flag` defaults to 0 in the table.
    crawl_trap_flag: number;
    crawl_trap_risk: number | null;
    trap_type: string | null;

    /** Timestamps stored as TEXT, defaulting to SQLite `datetime('now')`. */
    created_at: string;
    updated_at: string;
}
|
|
29
|
+
/** Data-access object for the `pages` table. */
export declare class PageRepository {
    private db;
    private upsertStmt;
    private getIdStmt;
    /** @param db open better-sqlite3 connection the repository runs its statements on. */
    constructor(db: Database);
    /**
     * Insert the page, or update the existing row with the same
     * (`site_id`, `normalized_url`).
     *
     * @returns the raw result of running the upsert statement.
     */
    upsertPage(
        page: Partial<Page> & {
            site_id: number;
            normalized_url: string;
            last_seen_snapshot_id: number;
        },
    ): import("better-sqlite3").RunResult;
    /** Same write as `upsertPage`, but returns the row id of the page. */
    upsertAndGetId(
        page: Partial<Page> & {
            site_id: number;
            normalized_url: string;
            last_seen_snapshot_id: number;
        },
    ): number;
    /** Look a page up by site and normalized URL. */
    getPage(siteId: number, url: string): Page | undefined;
    /** Pages whose `last_seen_snapshot_id` equals `snapshotId`. */
    getPagesBySnapshot(snapshotId: number): Page[];
    /** Row id for a site/URL pair, or `undefined` when the page is unknown. */
    getIdByUrl(siteId: number, url: string): number | undefined;
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
// Optional page columns that are written as NULL when the caller omits them.
const NULL_DEFAULT_PAGE_COLUMNS = [
    'http_status', 'canonical_url', 'content_hash', 'simhash', 'etag', 'last_modified',
    'html', 'soft404_score', 'security_error', 'redirect_chain', 'bytes_received',
    'crawl_trap_risk', 'trap_type',
];
// Optional page columns that are written as 0 when the caller omits them.
const ZERO_DEFAULT_PAGE_COLUMNS = ['noindex', 'nofollow', 'retries', 'depth', 'crawl_trap_flag'];
/**
 * Data-access object for the `pages` table.
 * The upsert and id-lookup statements are prepared once in the constructor.
 */
export class PageRepository {
    db;
    upsertStmt;
    getIdStmt;
    /** @param db open database connection exposing `prepare()` and `transaction()`. */
    constructor(db) {
        const upsertSql = `
      INSERT INTO pages (
        site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
        http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
        soft404_score, noindex, nofollow, security_error, retries, depth,
        redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
        updated_at
      ) VALUES (
        @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
        @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
        @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
        @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
        datetime('now')
      )
      ON CONFLICT(site_id, normalized_url) DO UPDATE SET
        last_seen_snapshot_id = excluded.last_seen_snapshot_id,
        http_status = excluded.http_status,
        canonical_url = excluded.canonical_url,
        content_hash = excluded.content_hash,
        simhash = excluded.simhash,
        etag = excluded.etag,
        last_modified = excluded.last_modified,
        html = excluded.html,
        soft404_score = excluded.soft404_score,
        noindex = excluded.noindex,
        nofollow = excluded.nofollow,
        security_error = excluded.security_error,
        retries = excluded.retries,
        depth = excluded.depth,
        redirect_chain = excluded.redirect_chain,
        bytes_received = excluded.bytes_received,
        crawl_trap_flag = excluded.crawl_trap_flag,
        crawl_trap_risk = excluded.crawl_trap_risk,
        trap_type = excluded.trap_type,
        updated_at = datetime('now')
    `;
        this.db = db;
        this.upsertStmt = this.db.prepare(upsertSql);
        this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
    }
    /**
     * Insert the page, or update the existing row for the same site/URL.
     *
     * Missing optional fields are normalised first: nullable columns become
     * `null`, the 0-default columns become `0`, and `first_seen_snapshot_id`
     * falls back to `last_seen_snapshot_id`. On conflict the update leaves
     * `first_seen_snapshot_id` untouched.
     *
     * @returns the result object of the statement's `run()`.
     */
    upsertPage(page) {
        const nullable = Object.fromEntries(NULL_DEFAULT_PAGE_COLUMNS.map((column) => [column, page[column] ?? null]));
        const zeroed = Object.fromEntries(ZERO_DEFAULT_PAGE_COLUMNS.map((column) => [column, page[column] ?? 0]));
        return this.upsertStmt.run({
            site_id: page.site_id,
            normalized_url: page.normalized_url,
            first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
            last_seen_snapshot_id: page.last_seen_snapshot_id,
            ...nullable,
            ...zeroed,
        });
    }
    /**
     * Upsert the page and read its id back inside one transaction.
     *
     * @throws Error when the row cannot be found after the upsert.
     */
    upsertAndGetId(page) {
        const upsertThenLookup = this.db.transaction(() => {
            this.upsertPage(page);
            const found = this.getIdStmt.get(page.site_id, page.normalized_url);
            if (found) {
                return found.id;
            }
            throw new Error(`Failed to retrieve ID for upserted page: ${page.normalized_url}`);
        });
        return upsertThenLookup();
    }
    /** Fetch the full row for a site/URL pair. */
    getPage(siteId, url) {
        const selectPage = this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?');
        return selectPage.get(siteId, url);
    }
    /** Fetch every page whose `last_seen_snapshot_id` equals `snapshotId`. */
    getPagesBySnapshot(snapshotId) {
        const selectBySnapshot = this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?');
        return selectBySnapshot.all(snapshotId);
    }
    /** Return the page id for a site/URL pair, or `undefined` when no row matches. */
    getIdByUrl(siteId, url) {
        const found = this.getIdStmt.get(siteId, url);
        return found?.id;
    }
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
/** One row of the `sites` table; `domain` is unique. */
export interface Site {
    /** Auto-increment primary key. */
    id: number;
    /** Unique, non-null domain. */
    domain: string;
    /** Stored as TEXT, defaulting to SQLite `datetime('now')`. */
    created_at: string;
    /** Optional TEXT column — presumably serialized JSON settings; confirm with callers. */
    settings_json: string | null;
    /** INTEGER with default 1 — presumably a 0/1 flag; confirm with callers. */
    is_active: number;
}
|
|
9
|
+
/** Data-access object for the `sites` table. */
export declare class SiteRepository {
    private db;
    /** @param db open better-sqlite3 connection the repository runs its statements on. */
    constructor(db: Database);
    /** Look a site up by domain; `undefined` when it is not registered. */
    getSite(domain: string): Site | undefined;
    /** Insert a new site row and return its id. */
    createSite(domain: string): number;
    /** Return the site for `domain`, creating the row first when it is missing. */
    firstOrCreateSite(domain: string): Site;
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/** Data-access object for the `sites` table. */
export class SiteRepository {
    db;
    /** @param db open database connection exposing `prepare()`. */
    constructor(db) {
        this.db = db;
    }
    /** Return the row whose `domain` matches, or `undefined` when there is none. */
    getSite(domain) {
        return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain);
    }
    /**
     * Insert a new site row.
     *
     * @returns the `lastInsertRowid` reported by the insert.
     * @throws when the insert is rejected, e.g. because the domain already exists.
     */
    createSite(domain) {
        const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
        const info = stmt.run(domain);
        return info.lastInsertRowid;
    }
    /**
     * Return the site for `domain`, creating it first when it does not exist.
     *
     * The insert uses `INSERT OR IGNORE`, so a row created by another writer
     * between the lookup and the insert is picked up instead of surfacing as a
     * UNIQUE-constraint failure.
     *
     * @throws Error when the row still cannot be read after the insert.
     */
    firstOrCreateSite(domain) {
        const existing = this.getSite(domain);
        if (existing) {
            return existing;
        }
        this.db.prepare('INSERT OR IGNORE INTO sites (domain) VALUES (?)').run(domain);
        const site = this.getSite(domain);
        if (!site) {
            // Never hand `undefined` to callers that were promised a Site.
            throw new Error(`Failed to create or load site: ${domain}`);
        }
        return site;
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
/** One row of the `snapshots` table: a single crawl run of a site. */
export interface Snapshot {
    /** Auto-increment primary key. */
    id: number;
    /** Owning site (`sites.id`). */
    site_id: number;
    /** Mirrors the CHECK constraint on the `type` column. */
    type:
        | 'full'
        | 'partial'
        | 'incremental';
    /** Stored as TEXT, defaulting to SQLite `datetime('now')`. */
    created_at: string;
    /** Both counts default to 0 in the table. */
    node_count: number;
    edge_count: number;
    /** Mirrors the CHECK constraint on the `status` column; the table default is 'running'. */
    status:
        | 'running'
        | 'completed'
        | 'failed';
    /** INTEGER with default 0 — presumably a 0/1 flag; confirm with callers. */
    limit_reached: number;

    // Optional summary statistics.
    health_score: number | null;
    orphan_count: number | null;
    thin_content_count: number | null;
}
|
|
15
|
+
/** Data-access object for the `snapshots` table. */
export declare class SnapshotRepository {
    private db;
    /** @param db open better-sqlite3 connection the repository runs its statements on. */
    constructor(db: Database);
    /** Insert a snapshot row for the site and return its id. */
    createSnapshot(
        siteId: number,
        type: Snapshot['type'],
        status?: Snapshot['status'],
    ): number;
    /** Most recent snapshot of the site, optionally restricted to one status. */
    getLatestSnapshot(
        siteId: number,
        status?: Snapshot['status'],
    ): Snapshot | undefined;
    /** Set a snapshot's final status and, optionally, some of its statistics. */
    updateSnapshotStatus(
        id: number,
        status: Exclude<Snapshot['status'], 'running'>,
        stats?: Partial<Snapshot>,
    ): void;
    /** Fetch a snapshot by id. */
    getSnapshot(id: number): Snapshot | undefined;
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// Statistic columns updateSnapshotStatus() may write, in the order they appear in the SET clause.
// Column names are interpolated into SQL, so this list must stay a fixed allow-list.
const SNAPSHOT_STAT_COLUMNS = [
    'node_count',
    'edge_count',
    'limit_reached',
    'health_score',
    'orphan_count',
    'thin_content_count',
];
/** Data-access object for the `snapshots` table. */
export class SnapshotRepository {
    db;
    /** @param db open database connection exposing `prepare()`. */
    constructor(db) {
        this.db = db;
    }
    /**
     * Insert a snapshot row.
     *
     * @param status initial status; defaults to 'running'.
     * @returns the `lastInsertRowid` reported by the insert.
     */
    createSnapshot(siteId, type, status = 'running') {
        const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
        const info = stmt.run(siteId, type, status);
        return info.lastInsertRowid;
    }
    /**
     * Return the newest snapshot of a site, optionally filtered by status.
     *
     * `created_at` is written by SQLite `datetime('now')`, which has one-second
     * resolution, so snapshots created within the same second are tie-broken
     * by `id` to keep "latest" deterministic.
     */
    getLatestSnapshot(siteId, status) {
        let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
        const params = [siteId];
        if (status) {
            sql += ' AND status = ?';
            params.push(status);
        }
        sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
        return this.db.prepare(sql).get(...params);
    }
    /**
     * Update a snapshot's status and any statistics supplied in `stats`.
     * Only the known statistic columns are written, and only those whose value
     * is not `undefined` (an explicit `null` is written as NULL).
     */
    updateSnapshotStatus(id, status, stats = {}) {
        const sets = ['status = ?'];
        const params = [status];
        for (const column of SNAPSHOT_STAT_COLUMNS) {
            const value = stats[column];
            if (value !== undefined) {
                sets.push(`${column} = ?`);
                params.push(value);
            }
        }
        params.push(id);
        const sql = `UPDATE snapshots SET ${sets.join(', ')} WHERE id = ?`;
        this.db.prepare(sql).run(...params);
    }
    /** Fetch a snapshot row by primary key. */
    getSnapshot(id) {
        return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id);
    }
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
// DDL executed by initSchema(), in order. Every statement uses IF NOT EXISTS,
// so running the list against an already-initialised database is harmless.
const SCHEMA_STATEMENTS = [
    // Sites Table
    `
    CREATE TABLE IF NOT EXISTS sites (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      domain TEXT UNIQUE NOT NULL,
      created_at TEXT DEFAULT (datetime('now')),
      settings_json TEXT,
      is_active INTEGER DEFAULT 1
    );
  `,
    // Snapshots Table
    `
    CREATE TABLE IF NOT EXISTS snapshots (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      site_id INTEGER NOT NULL,
      type TEXT CHECK(type IN ('full', 'partial', 'incremental')) NOT NULL,
      created_at TEXT DEFAULT (datetime('now')),
      node_count INTEGER DEFAULT 0,
      edge_count INTEGER DEFAULT 0,
      status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
      limit_reached INTEGER DEFAULT 0,
      health_score REAL,
      orphan_count INTEGER,
      thin_content_count INTEGER,
      FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
    );
  `,
    // Pages Table
    `
    CREATE TABLE IF NOT EXISTS pages (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      site_id INTEGER NOT NULL,
      normalized_url TEXT NOT NULL,
      first_seen_snapshot_id INTEGER,
      last_seen_snapshot_id INTEGER,
      http_status INTEGER,
      canonical_url TEXT,
      content_hash TEXT,
      simhash TEXT,
      etag TEXT,
      last_modified TEXT,
      html TEXT,
      soft404_score REAL,
      noindex INTEGER DEFAULT 0,
      nofollow INTEGER DEFAULT 0,
      security_error TEXT,
      retries INTEGER DEFAULT 0,
      depth INTEGER DEFAULT 0,
      redirect_chain TEXT,
      bytes_received INTEGER,
      crawl_trap_flag INTEGER DEFAULT 0,
      crawl_trap_risk REAL,
      trap_type TEXT,
      created_at TEXT DEFAULT (datetime('now')),
      updated_at TEXT DEFAULT (datetime('now')),
      FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE,
      FOREIGN KEY(first_seen_snapshot_id) REFERENCES snapshots(id),
      FOREIGN KEY(last_seen_snapshot_id) REFERENCES snapshots(id),
      UNIQUE(site_id, normalized_url)
    );
  `,
    // Index for Pages
    `CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`,
    // Edges Table
    `
    CREATE TABLE IF NOT EXISTS edges (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      snapshot_id INTEGER NOT NULL,
      source_page_id INTEGER NOT NULL,
      target_page_id INTEGER NOT NULL,
      weight REAL DEFAULT 1.0,
      rel TEXT CHECK(rel IN ('nofollow', 'sponsored', 'ugc', 'internal', 'external', 'unknown')) DEFAULT 'internal',
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
      FOREIGN KEY(source_page_id) REFERENCES pages(id) ON DELETE CASCADE,
      FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
    );
  `,
    // Index for Edges
    `CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`,
    `CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`,
    // Metrics Table
    `
    CREATE TABLE IF NOT EXISTS metrics (
      snapshot_id INTEGER NOT NULL,
      page_id INTEGER NOT NULL,
      authority_score REAL,
      hub_score REAL,
      pagerank REAL,
      pagerank_score REAL,
      link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
      crawl_status TEXT,
      word_count INTEGER,
      thin_content_score REAL,
      external_link_ratio REAL,
      orphan_score INTEGER,
      duplicate_cluster_id TEXT,
      duplicate_type TEXT CHECK(duplicate_type IN ('exact', 'near', 'template_heavy', 'none')),
      is_cluster_primary INTEGER DEFAULT 0,
      PRIMARY KEY(snapshot_id, page_id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
      FOREIGN KEY(page_id) REFERENCES pages(id) ON DELETE CASCADE
    );
  `,
    `CREATE INDEX IF NOT EXISTS idx_metrics_snapshot ON metrics(snapshot_id);`,
    // Duplicate Clusters Table
    `
    CREATE TABLE IF NOT EXISTS duplicate_clusters (
      id TEXT NOT NULL,
      snapshot_id INTEGER NOT NULL,
      type TEXT CHECK(type IN ('exact', 'near', 'template_heavy')) NOT NULL,
      size INTEGER NOT NULL,
      representative TEXT NOT NULL,
      severity TEXT CHECK(severity IN ('low', 'medium', 'high')) NOT NULL,
      PRIMARY KEY(snapshot_id, id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
    );
  `,
    // Content Clusters Table
    `
    CREATE TABLE IF NOT EXISTS content_clusters (
      id INTEGER NOT NULL,
      snapshot_id INTEGER NOT NULL,
      count INTEGER NOT NULL,
      primary_url TEXT NOT NULL,
      risk TEXT CHECK(risk IN ('low', 'medium', 'high')) NOT NULL,
      shared_path_prefix TEXT,
      PRIMARY KEY(snapshot_id, id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
    );
  `,
];
/**
 * Create all tables and indexes if they are missing, then run the column
 * migrations for databases created by earlier versions.
 *
 * @param db open database connection exposing `exec()`.
 */
export function initSchema(db) {
    for (const statement of SCHEMA_STATEMENTS) {
        db.exec(statement);
    }
    // Migration: add columns to existing DBs that were created before this update
    migrateSchema(db);
}
|
|
135
|
+
/**
 * Add columns that databases created by earlier versions may be missing.
 *
 * Each column is added with `ALTER TABLE ... ADD COLUMN`. SQLite rejects that
 * with "duplicate column name" when the column is already there, which is the
 * expected outcome on an up-to-date database and is ignored. Any other failure
 * (locked database, I/O error, ...) is rethrown instead of being silently
 * swallowed, so a half-migrated schema cannot go unnoticed.
 *
 * @param db open database connection exposing `exec()`.
 * @throws any `exec()` error other than "duplicate column name".
 */
function migrateSchema(db) {
    // [table, column, column definition], in the order they are applied.
    const additions = [
        ['pages', 'redirect_chain', 'TEXT'],
        ['pages', 'bytes_received', 'INTEGER'],
        ['pages', 'crawl_trap_flag', 'INTEGER DEFAULT 0'],
        ['pages', 'crawl_trap_risk', 'REAL'],
        ['pages', 'trap_type', 'TEXT'],
        ['edges', 'weight', 'REAL DEFAULT 1.0'],
        ['metrics', 'pagerank_score', 'REAL'],
        ['metrics', 'link_role', 'TEXT'],
        ['metrics', 'duplicate_cluster_id', 'TEXT'],
        ['metrics', 'duplicate_type', 'TEXT'],
        ['metrics', 'is_cluster_primary', 'INTEGER DEFAULT 0'],
    ];
    for (const [table, column, definition] of additions) {
        try {
            db.exec(`ALTER TABLE ${table} ADD COLUMN ${column} ${definition}`);
        }
        catch (err) {
            const alreadyExists = err instanceof Error && /duplicate column name/i.test(err.message);
            if (!alreadyExists) {
                throw err;
            }
        }
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
/**
 * Result of comparing two crawl graphs, as produced by `compareGraphs`.
 * Pages are matched between the two graphs by URL.
 */
export interface DiffResult {
    /** URLs present in the new graph but not in the old one. */
    addedUrls: Array<string>;
    /** URLs present in the old graph but not in the new one. */
    removedUrls: Array<string>;
    /** Pages found in both graphs whose status differs. */
    changedStatus: Array<{
        url: string;
        oldStatus: number;
        newStatus: number;
    }>;
    /** Pages found in both graphs whose canonical differs (`null` when none is set). */
    changedCanonical: Array<{
        url: string;
        oldCanonical: string | null;
        newCanonical: string | null;
    }>;
    /** Pages found in both graphs whose duplicate cluster id differs (`null` when none is set). */
    changedDuplicateGroup: Array<{
        url: string;
        oldGroup: string | null;
        newGroup: string | null;
    }>;
    /** Graph-level metric differences, each computed as new value minus old value. */
    metricDeltas: {
        structuralEntropy: number;
        orphanCount: number;
        crawlEfficiency: number;
    };
}
|
|
26
|
+
/**
 * Compares two crawl graphs, matching pages by URL.
 *
 * @param oldGraph - The earlier graph, used as the baseline.
 * @param newGraph - The later graph, compared against the baseline.
 * @returns Added and removed URLs, per-page status / canonical / duplicate-group
 *   changes for pages present in both graphs, and metric deltas (new minus old).
 */
export declare function compareGraphs(oldGraph: Graph, newGraph: Graph): DiffResult;
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { calculateMetrics } from '../graph/metrics.js';
|
|
2
|
+
/**
 * Compares two crawl graphs, matching pages by URL.
 *
 * For pages present in both graphs, reports changes in status, canonical and
 * duplicate cluster id. A missing/empty canonical or cluster id is normalised
 * to `null` BEFORE comparing, so `undefined` vs `null` (or an empty string) is
 * not reported as a change with identical `null` old/new values.
 *
 * @param oldGraph - The earlier graph, used as the baseline.
 * @param newGraph - The later graph, compared against the baseline.
 * @returns Added/removed URLs, per-page changes, and metric deltas computed as
 *   new value minus old value.
 */
export function compareGraphs(oldGraph, newGraph) {
    const oldNodes = new Map(oldGraph.getNodes().map(n => [n.url, n]));
    const newNodes = new Map(newGraph.getNodes().map(n => [n.url, n]));
    const addedUrls = [];
    const changedStatus = [];
    const changedCanonical = [];
    const changedDuplicateGroup = [];
    // Added & Changed
    for (const [url, newNode] of newNodes) {
        const oldNode = oldNodes.get(url);
        if (!oldNode) {
            addedUrls.push(url);
            continue;
        }
        // Changed Status
        if (oldNode.status !== newNode.status) {
            changedStatus.push({ url, oldStatus: oldNode.status, newStatus: newNode.status });
        }
        // Changed Canonical (compare the normalised values that are actually reported)
        const oldCanonical = oldNode.canonical || null;
        const newCanonical = newNode.canonical || null;
        if (oldCanonical !== newCanonical) {
            changedCanonical.push({ url, oldCanonical, newCanonical });
        }
        // Changed Duplicate Group
        const oldGroup = oldNode.duplicateClusterId || null;
        const newGroup = newNode.duplicateClusterId || null;
        if (oldGroup !== newGroup) {
            changedDuplicateGroup.push({ url, oldGroup, newGroup });
        }
    }
    // Removed
    const removedUrls = [];
    for (const url of oldNodes.keys()) {
        if (!newNodes.has(url)) {
            removedUrls.push(url);
        }
    }
    // Metrics
    // maxDepth is ignored by current calculateMetrics implementation but required by signature
    const oldMetrics = calculateMetrics(oldGraph, 10);
    const newMetrics = calculateMetrics(newGraph, 10);
    const metricDeltas = {
        structuralEntropy: newMetrics.structuralEntropy - oldMetrics.structuralEntropy,
        orphanCount: newMetrics.orphanPages.length - oldMetrics.orphanPages.length,
        crawlEfficiency: newMetrics.crawlEfficiencyScore - oldMetrics.crawlEfficiencyScore
    };
    return {
        addedUrls,
        removedUrls,
        changedStatus,
        changedCanonical,
        changedDuplicateGroup,
        metricDeltas
    };
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { Graph, ClusterInfo } from './graph.js';
|
|
2
|
+
/**
 * Detects content clusters using 64-bit SimHash and Hamming Distance.
 * Uses band optimization to reduce O(n^2) comparisons.
 *
 * @param graph - Graph whose pages are clustered.
 * @param threshold - Optional; presumably the maximum Hamming distance at which
 *   two pages count as similar. TODO confirm against the implementation,
 *   including the default value.
 * @param minSize - Optional; presumably the minimum number of pages for a
 *   cluster to be reported. TODO confirm against the implementation.
 * @returns The detected clusters.
 */
export declare function detectContentClusters(graph: Graph, threshold?: number, minSize?: number): ClusterInfo[];
|