@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { request } from 'undici';
|
|
2
|
+
import * as cheerio from 'cheerio';
|
|
3
|
+
import { normalizeUrl } from './normalize.js';
|
|
4
|
+
|
|
5
|
+
export class Sitemap {
  /**
   * Downloads a sitemap (or sitemap index) and returns every page URL found.
   *
   * Sitemap indexes are followed recursively. Each sitemap URL is requested at
   * most once, and traversal stops once 50 distinct sitemap URLs have been seen.
   *
   * @param url - Absolute URL of the sitemap or sitemap index to start from.
   * @returns De-duplicated list of URLs, each passed through `normalizeUrl`.
   */
  async fetch(url: string): Promise<string[]> {
    const seenSitemaps = new Set<string>();
    const collected = new Set<string>();

    await this.processSitemap(url, seenSitemaps, collected);

    return [...collected];
  }

  /**
   * Fetches one sitemap document and either recurses into its child sitemaps
   * (index file) or adds its `<url><loc>` entries to `urls` (URL set).
   *
   * Failures are logged with `console.warn` and never propagate to the caller.
   *
   * @param url - Sitemap document to fetch.
   * @param visited - Sitemap URLs already requested during this traversal (mutated).
   * @param urls - Accumulator of normalized page URLs (mutated).
   */
  private async processSitemap(url: string, visited: Set<string>, urls: Set<string>) {
    // Loop detection: never request the same sitemap twice.
    if (visited.has(url)) return;
    visited.add(url);

    // Hard cap on the number of sitemap documents, to prevent abuse.
    if (visited.size > 50) return;

    try {
      const res = await request(url, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 10000,
        bodyTimeout: 10000
      });

      const succeeded = res.statusCode >= 200 && res.statusCode < 300;
      if (!succeeded) {
        // Discard the unread body of a non-2xx response.
        await res.body.dump();
        return;
      }

      const xml = await res.body.text();
      // Cheap sanity check: anything not starting with '<' is not XML.
      if (!xml.trim().startsWith('<')) return;

      const $ = cheerio.load(xml, { xmlMode: true });

      // A document containing <sitemap><loc> elements is a sitemap index.
      const indexEntries = $('sitemap > loc');
      if (indexEntries.length > 0) {
        const childSitemaps = indexEntries
          .toArray()
          .map((el) => $(el).text().trim())
          .filter((loc) => loc.length > 0);

        // Children are fetched one at a time to avoid a concurrency spike.
        for (const childUrl of childSitemaps) {
          await this.processSitemap(childUrl, visited, urls);
        }
        return;
      }

      // Otherwise treat the document as a plain URL set.
      const pageLocs = $('url > loc')
        .toArray()
        .map((el) => $(el).text().trim());
      for (const loc of pageLocs) {
        if (!loc) continue;
        const normalized = normalizeUrl(loc, '');
        if (normalized) {
          urls.add(normalized);
        }
      }
    } catch (e) {
      console.warn(`Failed to fetch sitemap ${url}:`, e);
    }
  }
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
|
|
2
|
+
/** Category of crawl trap a URL was classified as. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';

/** Outcome of a trap check: a risk score and the trap category, if any. */
export interface TrapResult {
  /** Highest risk assigned by any detector that matched; 0 when none matched. */
  risk: number;
  /** Category recorded by the detectors, or `null` when none matched. */
  type: TrapType | null;
}

/**
 * Stateful heuristic detector for crawl traps (session IDs, calendar archives,
 * deep pagination and query-parameter explosion).
 *
 * State is kept per `origin + pathname`, so one instance should be reused for
 * all URLs of a crawl and cleared with {@link TrapDetector.reset} between crawls.
 */
export class TrapDetector {
  /**
   * Date-like path segments such as `/2023/12/01` or `/01-12-2023`, followed by
   * either another segment or the end of the path.
   */
  private static readonly CALENDAR_PATTERN =
    /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;

  /** Distinct (sorted) query strings seen per path, capped just above the threshold. */
  private pathCounters = new Map<string, Set<string>>();
  /** Highest numeric page parameter seen per path. */
  private paginationCounters = new Map<string, number>();
  private sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);

  // Configurable thresholds
  private PARAM_EXPLOSION_THRESHOLD = 30;
  private PAGINATION_THRESHOLD = 50;

  /**
   * @param options.paramThreshold - Distinct query strings per path tolerated
   *   before the path is flagged as faceted navigation (default 30).
   * @param options.paginationThreshold - Highest page number tolerated before a
   *   URL is flagged as a pagination loop (default 50).
   *   Falsy values (including 0) keep the defaults.
   */
  constructor(options: { paramThreshold?: number, paginationThreshold?: number } = {}) {
    if (options.paramThreshold) this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold;
    if (options.paginationThreshold) this.PAGINATION_THRESHOLD = options.paginationThreshold;
  }

  /**
   * Checks if a URL represents a potential crawl trap.
   *
   * `risk` is the maximum over all matching detectors. `type` is set by the
   * session, calendar and pagination detectors in that order (a later match
   * overwrites an earlier one); the faceted-navigation detector only sets it
   * when no other detector matched.
   *
   * @param rawUrl - Absolute URL to inspect. Unparseable URLs yield risk 0.
   * @param _depth - Crawl depth of the URL; currently unused.
   * @returns The risk score and trap category for this URL.
   */
  checkTrap(rawUrl: string, _depth: number): TrapResult {
    let risk = 0;
    let type: TrapType | null = null;

    try {
      const u = new URL(rawUrl);
      const params = new URLSearchParams(u.search);
      const pathname = u.pathname;
      const pathKey = `${u.origin}${pathname}`;

      // 1. Session IDs / Tracking Parameters
      for (const key of params.keys()) {
        const lowered = key.toLowerCase();
        if (this.sessionParams.has(lowered) || lowered.includes('session')) {
          risk = Math.max(risk, 0.9);
          type = 'session_trap';
          break; // one match is enough; further keys cannot change the result
        }
      }

      // 2. Calendar Pattern Detection
      // The date may be the last path segment, so a trailing slash is optional.
      if (TrapDetector.CALENDAR_PATTERN.test(pathname)) {
        risk = Math.max(risk, 0.7);
        type = 'calendar_trap';
      }

      // 3. Pagination Loop
      const pageParam = params.get('page') || params.get('p') || params.get('pg');
      if (pageParam && /^\d+$/.test(pageParam)) {
        const pageNum = parseInt(pageParam, 10);
        const currentMaxPage = this.paginationCounters.get(pathKey) || 0;

        if (pageNum > currentMaxPage) {
          this.paginationCounters.set(pathKey, pageNum);
        }

        if (pageNum > this.PAGINATION_THRESHOLD) {
          risk = Math.max(risk, 0.85);
          type = 'pagination_loop';
        }
      }

      // 4. Infinite Parameter Explosion (Faceted Navigation)
      // `URLSearchParams.size` is missing on older runtimes, where the original
      // `params.size > 0` check was always false; probe the iterator instead.
      const hasParams = !params.keys().next().done;
      if (hasParams) {
        let paramSet = this.pathCounters.get(pathKey);
        if (!paramSet) {
          paramSet = new Set<string>();
          this.pathCounters.set(pathKey, paramSet);
        }

        // Once a path is over the threshold it stays flagged, so stop recording
        // new combinations to keep memory bounded on real traps.
        if (paramSet.size <= this.PARAM_EXPLOSION_THRESHOLD) {
          params.sort(); // order-insensitive key: ?a=1&b=2 equals ?b=2&a=1
          paramSet.add(params.toString());
        }

        if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
          risk = Math.max(risk, 0.95);
          if (!type) type = 'faceted_navigation';
        }
      }
    } catch (_e) {
      // Invalid URL: report no risk rather than failing the crawl.
    }

    return { risk, type };
  }

  /**
   * Resets internal state (useful for multi-crawl sessions if needed)
   */
  reset() {
    this.pathCounters.clear();
    this.paginationCounters.clear();
  }
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { getDb } from './index.js';
|
|
2
|
+
import { PageRepository } from './repositories/PageRepository.js';
|
|
3
|
+
import { EdgeRepository } from './repositories/EdgeRepository.js';
|
|
4
|
+
import { MetricsRepository, DbMetrics } from './repositories/MetricsRepository.js';
|
|
5
|
+
import { SnapshotRepository } from './repositories/SnapshotRepository.js';
|
|
6
|
+
import { Graph } from '../graph/graph.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Rebuilds an in-memory {@link Graph} from the rows stored for one snapshot.
 *
 * Loads the snapshot's pages, per-page metrics, edges, duplicate clusters and
 * content clusters from the shared database handle and copies them onto a new
 * graph. Edges whose endpoints are not among the loaded pages are skipped.
 *
 * @param snapshotId - ID of the snapshot to load.
 * @returns A graph populated with the snapshot's nodes, edges and clusters.
 */
export function loadGraphFromSnapshot(snapshotId: number): Graph {
  const db = getDb();
  const pageRepo = new PageRepository(db);
  const edgeRepo = new EdgeRepository(db);
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);

  const pages = pageRepo.getPagesBySnapshot(snapshotId);
  const metrics = metricsRepo.getMetrics(snapshotId);
  const snapshot = snapshotRepo.getSnapshot(snapshotId);

  // Index metrics by page ID for O(1) lookup while iterating pages.
  const metricsMap = new Map<number, DbMetrics>();
  for (const m of metrics) {
    metricsMap.set(m.page_id, m);
  }

  const graph = new Graph();
  if (snapshot) {
    graph.limitReached = !!snapshot.limit_reached;
  }

  // Page ID -> normalized URL, used to resolve edge endpoints below.
  const idMap = new Map<number, string>();

  for (const p of pages) {
    idMap.set(p.id, p.normalized_url);
    graph.addNode(p.normalized_url, p.depth, p.http_status || 0);

    const m = metricsMap.get(p.id);

    // 'new' when the page first appeared in this snapshot; otherwise derived
    // from the recorded crawl status ('cached' -> unchanged, 'fetched' -> changed).
    let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
    if (p.first_seen_snapshot_id === snapshotId) {
      incrementalStatus = 'new';
    } else if (m?.crawl_status === 'cached') {
      incrementalStatus = 'unchanged';
    } else if (m?.crawl_status === 'fetched') {
      incrementalStatus = 'changed';
    }

    // The redirect chain is stored as JSON text. A malformed value on a single
    // row must not abort loading the entire snapshot, so it is logged and dropped.
    let redirectChain;
    if (p.redirect_chain) {
      try {
        redirectChain = JSON.parse(p.redirect_chain);
      } catch (_e) {
        console.warn(`Ignoring malformed redirect_chain for page ${p.normalized_url}`);
      }
    }

    graph.updateNodeData(p.normalized_url, {
      canonical: p.canonical_url || undefined,
      contentHash: p.content_hash || undefined,
      simhash: p.simhash || undefined,
      etag: p.etag || undefined,
      lastModified: p.last_modified || undefined,
      html: p.html || undefined,
      soft404Score: p.soft404_score || undefined,
      noindex: !!p.noindex,
      nofollow: !!p.nofollow,
      incrementalStatus,
      securityError: p.security_error || undefined,
      retries: p.retries || undefined,
      bytesReceived: p.bytes_received || undefined,
      redirectChain,
      crawlTrapFlag: !!p.crawl_trap_flag,
      crawlTrapRisk: p.crawl_trap_risk || undefined,
      trapType: p.trap_type || undefined,
      // Metrics
      pageRank: m?.pagerank ?? undefined,
      // Falls back to the raw pagerank when no score was stored.
      pageRankScore: m?.pagerank_score ?? m?.pagerank ?? undefined,
      authorityScore: m?.authority_score ?? undefined,
      hubScore: m?.hub_score ?? undefined,
      linkRole: m?.link_role ?? undefined,
      // Duplicate info
      duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
      duplicateType: m?.duplicate_type ?? undefined,
      isClusterPrimary: m?.is_cluster_primary ? true : undefined,
    });
  }

  const edges = edgeRepo.getEdgesBySnapshot(snapshotId);

  for (const e of edges) {
    const source = idMap.get(e.source_page_id);
    const target = idMap.get(e.target_page_id);
    // Skip edges that reference pages not loaded for this snapshot.
    if (source && target) {
      graph.addEdge(source, target, e.weight || 1.0);
    }
  }

  // Load duplicate clusters
  // Rows are untyped here: the table schema is defined outside this function.
  const dupClusters = db.prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.duplicateClusters = dupClusters.map(c => ({
    id: c.id,
    type: c.type,
    size: c.size,
    representative: c.representative,
    severity: c.severity
  }));

  // Load content clusters
  const contentClusters = db.prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.contentClusters = contentClusters.map(c => ({
    id: c.id,
    count: c.count,
    primaryUrl: c.primary_url,
    risk: c.risk,
    sharedPathPrefix: c.shared_path_prefix || undefined
  }));

  return graph;
}
|
package/src/db/index.ts
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import Database from 'better-sqlite3';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import { initSchema } from './schema.js';
|
|
6
|
+
|
|
7
|
+
// Process-wide database handle: assigned by getDb() on first use and set back to null by closeDb().
let dbInstance: Database.Database | null = null;
|
|
8
|
+
|
|
9
|
+
/**
 * Resolves where the SQLite database lives.
 *
 * Resolution order:
 * 1. `NODE_ENV === 'test'` -> `':memory:'`.
 * 2. `CRAWLITH_DB_PATH`, when set to a non-empty value.
 * 3. `<home>/.crawlith/crawlith.db`; the directory is created (and restricted
 *    to the owner, mode 700) if it does not exist yet.
 *
 * @returns The database path, or `':memory:'` under test.
 */
export function getDbPath(): string {
  const { NODE_ENV: nodeEnv, CRAWLITH_DB_PATH: overridePath } = process.env;

  if (nodeEnv === 'test') return ':memory:';
  if (overridePath) return overridePath;

  const dataDir = path.join(os.homedir(), '.crawlith');
  const dirMissing = !fs.existsSync(dataDir);
  if (dirMissing) {
    fs.mkdirSync(dataDir, { recursive: true });
    // Set permissions to 700 (user only)
    fs.chmodSync(dataDir, 0o700);
  }

  return path.join(dataDir, 'crawlith.db');
}
|
|
25
|
+
|
|
26
|
+
/**
 * Returns the shared database handle, opening and configuring it on first use.
 *
 * On first call the database at `getDbPath()` is opened, pragmas are applied,
 * file permissions are tightened, an integrity check is run (a failure is only
 * logged) and the schema is initialized. Later calls return the cached handle.
 *
 * @returns The open database handle.
 * @throws Rethrows any error raised while configuring the database; in that
 *   case the freshly opened handle is closed and nothing is cached.
 */
export function getDb(): Database.Database {
  if (dbInstance) {
    return dbInstance;
  }

  const dbPath = getDbPath();
  const db = new Database(dbPath);

  try {
    // Hardening & Performance Configuration
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');
    db.pragma('foreign_keys = ON');
    db.pragma('temp_store = MEMORY');
    db.pragma('mmap_size = 30000000000');
    db.pragma('cache_size = -20000');
    db.pragma('busy_timeout = 5000');

    // Security controls
    // Ensure file permissions are 600 (user read/write only). In-memory
    // databases have no backing file, so there is nothing to chmod.
    if (!db.memory) {
      try {
        fs.chmodSync(dbPath, 0o600);
      } catch (e) {
        // Best effort: keep going, but make the failure visible.
        console.warn(`Could not restrict permissions on ${dbPath}:`, e);
      }
    }

    // Integrity check on startup
    const integrity = db.pragma('integrity_check', { simple: true });
    if (integrity !== 'ok') {
      console.warn('Database integrity check failed:', integrity);
    }

    // Initialize schema
    initSchema(db);
  } catch (err) {
    // Do not leak an open, half-configured handle when setup fails.
    db.close();
    throw err;
  }

  dbInstance = db;
  return db;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Closes the shared database handle, if one is open, and clears the cache so
 * the next `getDb()` call opens a fresh connection. Does nothing otherwise.
 */
export function closeDb() {
  const handle = dbInstance;
  if (!handle) return;

  handle.close();
  dbInstance = null;
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
|
|
3
|
+
/** One row of the `edges` table: a link between two pages in a snapshot. */
export interface Edge {
  id: number;
  snapshot_id: number;
  source_page_id: number;
  target_page_id: number;
  weight: number;
  rel: 'nofollow' | 'sponsored' | 'ugc' | 'internal' | 'external' | 'unknown';
}

/** Data-access helper for the `edges` table. */
export class EdgeRepository {
  // Prepared once in the constructor and reused for every insert.
  private insertStmt;

  /**
   * @param db - Open database handle used for all statements.
   */
  constructor(private db: Database) {
    const insertSql = `
      INSERT INTO edges (snapshot_id, source_page_id, target_page_id, weight, rel)
      VALUES (?, ?, ?, ?, ?)
    `;
    this.insertStmt = this.db.prepare(insertSql);
  }

  /**
   * Inserts a single edge row.
   *
   * @param snapshotId - Snapshot the edge belongs to.
   * @param sourcePageId - ID of the linking page.
   * @param targetPageId - ID of the linked page.
   * @param weight - Edge weight; defaults to 1.0.
   * @param rel - Link relation stored with the edge; defaults to `'internal'`.
   */
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight: number = 1.0, rel: string = 'internal') {
    const values = [snapshotId, sourcePageId, targetPageId, weight, rel] as const;
    this.insertStmt.run(...values);
  }

  /**
   * Returns every edge stored for the given snapshot.
   *
   * @param snapshotId - Snapshot whose edges are requested.
   */
  getEdgesBySnapshot(snapshotId: number): Edge[] {
    const query = this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?');
    const rows = query.all(snapshotId);
    return rows as Edge[];
  }
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
|
|
3
|
+
/** One row of the `metrics` table: per-page metrics for a given snapshot. */
export interface DbMetrics {
  snapshot_id: number;
  page_id: number;
  authority_score: number | null;
  hub_score: number | null;
  pagerank: number | null;
  pagerank_score: number | null;
  link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
  crawl_status: string | null;
  word_count: number | null;
  thin_content_score: number | null;
  external_link_ratio: number | null;
  orphan_score: number | null;
  duplicate_cluster_id: string | null;
  duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
  // Stored as a number; consumers treat it as a truthy/falsy flag.
  is_cluster_primary: number;
}

/** Data-access helper for the `metrics` table. */
export class MetricsRepository {
  // Prepared once in the constructor and reused for every write.
  private insertStmt;

  /**
   * @param db - Open database handle used for all statements.
   */
  constructor(private db: Database) {
    // INSERT OR REPLACE: writing a row that conflicts with an existing one
    // replaces it instead of failing.
    const upsertSql = `
      INSERT OR REPLACE INTO metrics (
        snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
        link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
        orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
      ) VALUES (
        @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
        @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
        @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
      )
    `;
    this.insertStmt = this.db.prepare(upsertSql);
  }

  /**
   * Writes one metrics row, binding the object's fields to the named parameters.
   *
   * @param metrics - Complete metrics row to store.
   */
  insertMetrics(metrics: DbMetrics) {
    this.insertStmt.run(metrics);
  }

  /**
   * Returns all metrics rows recorded for a snapshot.
   *
   * @param snapshotId - Snapshot whose metrics are requested.
   */
  getMetrics(snapshotId: number): DbMetrics[] {
    const query = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?');
    const rows = query.all(snapshotId);
    return rows as DbMetrics[];
  }

  /**
   * Returns the metrics row of a single page within a snapshot.
   *
   * @param snapshotId - Snapshot to look in.
   * @param pageId - Page whose metrics are requested.
   * @returns The row, or `undefined` when none exists.
   */
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
    const query = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
    const row = query.get(snapshotId, pageId);
    return row as DbMetrics | undefined;
  }
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
|
|
3
|
+
/** One row of the `pages` table. */
export interface Page {
  id: number;
  site_id: number;
  normalized_url: string;
  first_seen_snapshot_id: number | null;
  last_seen_snapshot_id: number | null;
  http_status: number | null;
  canonical_url: string | null;
  content_hash: string | null;
  simhash: string | null;
  etag: string | null;
  last_modified: string | null;
  html: string | null;
  soft404_score: number | null;
  noindex: number;
  nofollow: number;
  security_error: string | null;
  retries: number;
  depth: number;
  redirect_chain: string | null;
  bytes_received: number | null;
  crawl_trap_flag: number;
  crawl_trap_risk: number | null;
  trap_type: string | null;
  created_at: string;
  updated_at: string;
}

/** Data-access helper for the `pages` table. */
export class PageRepository {
  // Both statements are prepared once in the constructor and reused.
  private upsertStmt;
  private getIdStmt;

  /**
   * @param db - Open database handle used for all statements.
   */
  constructor(private db: Database) {
    // A page is identified by (site_id, normalized_url). On conflict every
    // column except first_seen_snapshot_id is refreshed from the new values.
    const upsertSql = `
      INSERT INTO pages (
        site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
        http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
        soft404_score, noindex, nofollow, security_error, retries, depth,
        redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
        updated_at
      ) VALUES (
        @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
        @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
        @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
        @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
        datetime('now')
      )
      ON CONFLICT(site_id, normalized_url) DO UPDATE SET
        last_seen_snapshot_id = excluded.last_seen_snapshot_id,
        http_status = excluded.http_status,
        canonical_url = excluded.canonical_url,
        content_hash = excluded.content_hash,
        simhash = excluded.simhash,
        etag = excluded.etag,
        last_modified = excluded.last_modified,
        html = excluded.html,
        soft404_score = excluded.soft404_score,
        noindex = excluded.noindex,
        nofollow = excluded.nofollow,
        security_error = excluded.security_error,
        retries = excluded.retries,
        depth = excluded.depth,
        redirect_chain = excluded.redirect_chain,
        bytes_received = excluded.bytes_received,
        crawl_trap_flag = excluded.crawl_trap_flag,
        crawl_trap_risk = excluded.crawl_trap_risk,
        trap_type = excluded.trap_type,
        updated_at = datetime('now')
    `;
    this.upsertStmt = this.db.prepare(upsertSql);

    this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
  }

  /**
   * Inserts a page, or updates the existing row for the same site and URL.
   *
   * Omitted optional fields are written as `null`, except the numeric flags and
   * counters (`noindex`, `nofollow`, `retries`, `depth`, `crawl_trap_flag`),
   * which default to 0. `first_seen_snapshot_id` defaults to
   * `last_seen_snapshot_id`.
   *
   * @param page - Page data; site, URL and last-seen snapshot are mandatory.
   * @returns The result object of the statement's `run` call.
   */
  upsertPage(page: Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number }) {
    const orNull = <T>(value: T | null | undefined): T | null => value ?? null;
    const orZero = (value: number | null | undefined): number => value ?? 0;

    const bindings = {
      site_id: page.site_id,
      normalized_url: page.normalized_url,
      first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
      last_seen_snapshot_id: page.last_seen_snapshot_id,
      http_status: orNull(page.http_status),
      canonical_url: orNull(page.canonical_url),
      content_hash: orNull(page.content_hash),
      simhash: orNull(page.simhash),
      etag: orNull(page.etag),
      last_modified: orNull(page.last_modified),
      html: orNull(page.html),
      soft404_score: orNull(page.soft404_score),
      noindex: orZero(page.noindex),
      nofollow: orZero(page.nofollow),
      security_error: orNull(page.security_error),
      retries: orZero(page.retries),
      depth: orZero(page.depth),
      redirect_chain: orNull(page.redirect_chain),
      bytes_received: orNull(page.bytes_received),
      crawl_trap_flag: orZero(page.crawl_trap_flag),
      crawl_trap_risk: orNull(page.crawl_trap_risk),
      trap_type: orNull(page.trap_type),
    };

    return this.upsertStmt.run(bindings);
  }

  /**
   * Upserts a page and returns its row ID, with both steps in one transaction.
   *
   * @param page - Page data; site, URL and last-seen snapshot are mandatory.
   * @returns The ID of the inserted or updated row.
   * @throws Error when the row cannot be read back after the upsert.
   */
  upsertAndGetId(page: Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number }): number {
    const upsertThenLookup = this.db.transaction(() => {
      this.upsertPage(page);
      const found = this.getIdStmt.get(page.site_id, page.normalized_url) as { id: number } | undefined;
      if (found) return found.id;
      throw new Error(`Failed to retrieve ID for upserted page: ${page.normalized_url}`);
    });
    return upsertThenLookup();
  }

  /**
   * Looks up a page by site and normalized URL.
   *
   * @returns The full row, or `undefined` when no such page exists.
   */
  getPage(siteId: number, url: string): Page | undefined {
    const query = this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?');
    return query.get(siteId, url) as Page | undefined;
  }

  /**
   * Returns the pages whose `last_seen_snapshot_id` equals the given snapshot.
   *
   * @param snapshotId - Snapshot to match against `last_seen_snapshot_id`.
   */
  getPagesBySnapshot(snapshotId: number): Page[] {
    const query = this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?');
    return query.all(snapshotId) as Page[];
  }

  /**
   * Returns only the row ID of a page.
   *
   * @returns The ID, or `undefined` when no such page exists.
   */
  getIdByUrl(siteId: number, url: string): number | undefined {
    const found = this.getIdStmt.get(siteId, url) as { id: number } | undefined;
    return found?.id;
  }
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
|
|
3
|
+
/** One row of the `sites` table. */
export interface Site {
  id: number;
  domain: string;
  created_at: string;
  settings_json: string | null;
  is_active: number;
}

/** Data-access helper for the `sites` table. */
export class SiteRepository {
  /**
   * @param db - Open database handle used for all statements.
   */
  constructor(private db: Database) { }

  /**
   * Looks up a site by its domain.
   *
   * @returns The row, or `undefined` when the domain is not registered.
   */
  getSite(domain: string): Site | undefined {
    return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain) as Site | undefined;
  }

  /**
   * Inserts a new site row for the domain.
   *
   * @returns The row ID of the inserted site as a plain number.
   */
  createSite(domain: string): number {
    const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
    const info = stmt.run(domain);
    // lastInsertRowid may be a bigint; convert instead of asserting the type.
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the site for a domain, creating it first when it does not exist.
   *
   * @throws Error when the site still cannot be read back after creation,
   *   instead of returning `undefined` typed as a `Site`.
   */
  firstOrCreateSite(domain: string): Site {
    const existing = this.getSite(domain);
    if (existing) return existing;

    this.createSite(domain);
    const created = this.getSite(domain);
    if (!created) {
      throw new Error(`Failed to retrieve site after creation: ${domain}`);
    }
    return created;
  }
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { Database } from 'better-sqlite3';
|
|
2
|
+
|
|
3
|
+
/** One row of the `snapshots` table: a single crawl run of a site. */
export interface Snapshot {
  id: number;
  site_id: number;
  type: 'full' | 'partial' | 'incremental';
  created_at: string;
  node_count: number;
  edge_count: number;
  status: 'running' | 'completed' | 'failed';
  limit_reached: number;
  health_score: number | null;
  orphan_count: number | null;
  thin_content_count: number | null;
}

/** Data-access helper for the `snapshots` table. */
export class SnapshotRepository {
  /**
   * Statistic columns `updateSnapshotStatus` may write, in the order they
   * appear in the generated SET clause. Column names come only from this
   * fixed list, never from caller input.
   */
  private static readonly STAT_COLUMNS = [
    'node_count',
    'edge_count',
    'limit_reached',
    'health_score',
    'orphan_count',
    'thin_content_count',
  ] as const;

  /**
   * @param db - Open database handle used for all statements.
   */
  constructor(private db: Database) {}

  /**
   * Inserts a new snapshot row.
   *
   * @param siteId - Site the snapshot belongs to.
   * @param type - Kind of crawl.
   * @param status - Initial status; defaults to `'running'`.
   * @returns The row ID of the new snapshot as a plain number.
   */
  createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status: 'running' | 'completed' | 'failed' = 'running'): number {
    const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
    const info = stmt.run(siteId, type, status);
    // lastInsertRowid may be a bigint; convert instead of asserting the type.
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the most recent snapshot of a site, optionally filtered by status.
   *
   * Ordering falls back to the row ID so that snapshots sharing the same
   * `created_at` value are still returned newest-first deterministically.
   *
   * @param siteId - Site to look up.
   * @param status - When given, only snapshots with this status are considered.
   * @returns The newest matching snapshot, or `undefined` when there is none.
   */
  getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined {
    const filters = ['site_id = ?'];
    const params: Array<number | string> = [siteId];
    if (status) {
      filters.push('status = ?');
      params.push(status);
    }
    const sql = `SELECT * FROM snapshots WHERE ${filters.join(' AND ')} ORDER BY created_at DESC, id DESC LIMIT 1`;
    return this.db.prepare(sql).get(...params) as Snapshot | undefined;
  }

  /**
   * Sets the final status of a snapshot and, optionally, its statistics.
   *
   * Only statistic fields that are present (not `undefined`) in `stats` are
   * written; an explicit `null` is written as NULL. Other `Snapshot` fields in
   * `stats` are ignored.
   *
   * @param id - Snapshot to update.
   * @param status - New status.
   * @param stats - Statistic columns to update alongside the status.
   */
  updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats: Partial<Snapshot> = {}) {
    const sets: string[] = ['status = ?'];
    const params: Array<number | string | null> = [status];

    for (const column of SnapshotRepository.STAT_COLUMNS) {
      const value = stats[column];
      if (value !== undefined) {
        sets.push(`${column} = ?`);
        params.push(value);
      }
    }

    params.push(id);
    const sql = `UPDATE snapshots SET ${sets.join(', ')} WHERE id = ?`;
    this.db.prepare(sql).run(...params);
  }

  /**
   * Looks up a snapshot by its ID.
   *
   * @returns The row, or `undefined` when no such snapshot exists.
   */
  getSnapshot(id: number): Snapshot | undefined {
    return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id) as Snapshot | undefined;
  }
}
|