@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/crawler/sitemap.ts
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import { request } from 'undici';
|
|
2
|
-
import * as cheerio from 'cheerio';
|
|
3
|
-
import { normalizeUrl } from './normalize.js';
|
|
4
|
-
import { EngineContext } from '../events.js';
|
|
5
|
-
|
|
6
|
-
export class Sitemap {
  constructor(private context?: EngineContext) {}

  /**
   * Downloads a sitemap (or a sitemap index) and returns every page URL found.
   *
   * Sitemap indexes are followed recursively; each sitemap URL is fetched at
   * most once and the walk stops growing after 50 sitemap URLs have been seen.
   *
   * @param url - Location of the root sitemap or sitemap index.
   * @returns The de-duplicated list of normalized page URLs.
   */
  async fetch(url: string): Promise<string[]> {
    const seenSitemaps = new Set<string>();
    const collected = new Set<string>();

    await this.processSitemap(url, seenSitemaps, collected);

    return [...collected];
  }

  /**
   * Fetches one sitemap document and either recurses into its children
   * (sitemap index) or records its `<url><loc>` entries into `urls`.
   * Failures are reported through the optional engine context as a `warn`
   * event and never propagate to the caller.
   */
  private async processSitemap(url: string, visited: Set<string>, urls: Set<string>) {
    // Loop detection: never fetch the same sitemap twice.
    if (visited.has(url)) return;
    visited.add(url);

    // Hard limit on number of sitemaps to fetch to prevent abuse
    if (visited.size > 50) return;

    try {
      const res = await request(url, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 10000,
        bodyTimeout: 10000
      });

      const isSuccess = res.statusCode >= 200 && res.statusCode < 300;
      if (!isSuccess) {
        // Drain the body so the underlying connection can be released.
        await res.body.dump();
        return;
      }

      const xml = await res.body.text();
      // Cheap sanity check: anything that does not start with a tag is not XML.
      if (!xml.trim().startsWith('<')) return;

      const $ = cheerio.load(xml, { xmlMode: true });
      const indexEntries = $('sitemap > loc');

      if (indexEntries.length === 0) {
        // Plain URL set: collect every non-empty, normalizable location.
        $('url > loc').each((_, el) => {
          const loc = $(el).text().trim();
          if (!loc) return;
          const normalized = normalizeUrl(loc, '');
          if (normalized) {
            urls.add(normalized);
          }
        });
        return;
      }

      // Sitemap index: gather child locations first, then walk them.
      const childSitemaps: string[] = [];
      indexEntries.each((_, el) => {
        const loc = $(el).text().trim();
        if (loc) childSitemaps.push(loc);
      });

      // Sequential on purpose, to avoid a burst of concurrent requests.
      for (const childUrl of childSitemaps) {
        await this.processSitemap(childUrl, visited, urls);
      }
    } catch (e) {
      this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
    }
  }
}
|
package/src/crawler/trap.ts
DELETED
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
/** Categories of crawl trap that {@link TrapDetector} can report. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';

/** Outcome of a single {@link TrapDetector.checkTrap} call. */
export interface TrapResult {
  /** Highest risk value produced by any detector that fired (0 when none did). */
  risk: number;
  /** Trap category, or `null` when no detector fired. */
  type: TrapType | null;
}

/**
 * Stateful heuristic detector for crawl traps.
 *
 * The detector remembers, per `origin + pathname`, which distinct query
 * strings it has been shown, so the same instance must be reused across the
 * URLs of one crawl for the faceted-navigation check to work.
 */
export class TrapDetector {
  private pathCounters = new Map<string, Set<string>>();
  private paginationCounters = new Map<string, number>();
  private sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);

  // Configurable thresholds
  private PARAM_EXPLOSION_THRESHOLD = 30;
  private PAGINATION_THRESHOLD = 50;

  /**
   * Date-like path segments: `YYYY-MM-DD` or `DD-MM-YYYY`, with `-` or `/`
   * as separators, e.g. `/2023/12/01` or `/archive/2023-12-01/`.
   * The date must be followed by `/` or by the end of the path, so a date
   * that is the final path segment is recognised as well.
   */
  private static readonly CALENDAR_PATTERN =
    /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;

  /**
   * @param options.paramThreshold - Number of distinct query strings per path
   *   above which the path is treated as faceted navigation (default 30).
   * @param options.paginationThreshold - Page number above which a paginated
   *   URL is treated as a pagination loop (default 50).
   *   Falsy values (including 0) keep the defaults.
   */
  constructor(options: { paramThreshold?: number, paginationThreshold?: number } = {}) {
    if (options.paramThreshold) this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold;
    if (options.paginationThreshold) this.PAGINATION_THRESHOLD = options.paginationThreshold;
  }

  /**
   * Checks if a URL represents a potential crawl trap.
   *
   * `risk` is the maximum over all detectors that fire. `type` is the last
   * of the session / calendar / pagination detectors that fired;
   * `faceted_navigation` is only reported when none of those did.
   *
   * @param rawUrl - Absolute URL to inspect. Unparseable URLs yield
   *   `{ risk: 0, type: null }`.
   * @param _depth - Currently unused.
   */
  checkTrap(rawUrl: string, _depth: number): TrapResult {
    let risk = 0;
    let type: TrapType | null = null;

    try {
      const u = new URL(rawUrl);
      const params = new URLSearchParams(u.search);
      const pathname = u.pathname;
      const pathKey = `${u.origin}${pathname}`;

      // 1. Session IDs / Tracking Parameters
      for (const key of params.keys()) {
        const lowered = key.toLowerCase();
        if (this.sessionParams.has(lowered) || lowered.includes('session')) {
          risk = Math.max(risk, 0.9);
          type = 'session_trap';
        }
      }

      // 2. Calendar Pattern Detection
      if (TrapDetector.CALENDAR_PATTERN.test(pathname)) {
        risk = Math.max(risk, 0.7);
        type = 'calendar_trap';
      }

      // 3. Pagination Loop
      const pageParam = params.get('page') || params.get('p') || params.get('pg');
      if (pageParam && /^\d+$/.test(pageParam)) {
        const pageNum = parseInt(pageParam, 10);
        const currentMaxPage = this.paginationCounters.get(pathKey) || 0;

        // Track the highest page number seen for this path.
        if (pageNum > currentMaxPage) {
          this.paginationCounters.set(pathKey, pageNum);
        }

        if (pageNum > this.PAGINATION_THRESHOLD) {
          risk = Math.max(risk, 0.85);
          type = 'pagination_loop';
        }
      }

      // 4. Infinite Parameter Explosion (Faceted Navigation)
      // Sorting makes the key independent of parameter order. The serialized
      // form is empty exactly when there are no parameters, which avoids
      // relying on URLSearchParams.size (missing on older Node releases).
      params.sort();
      const paramKey = params.toString();
      if (paramKey !== '') {
        const paramSet = this.pathCounters.get(pathKey) || new Set<string>();
        paramSet.add(paramKey);
        this.pathCounters.set(pathKey, paramSet);

        if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
          risk = Math.max(risk, 0.95);
          if (!type) type = 'faceted_navigation';
        }
      }
    } catch (_e) {
      // Invalid URL: report "no trap" rather than failing the crawl.
    }

    return { risk, type };
  }

  /**
   * Resets internal state (useful for multi-crawl sessions if needed)
   */
  reset() {
    this.pathCounters.clear();
    this.paginationCounters.clear();
  }
}
|
package/src/db/graphLoader.ts
DELETED
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
import { getDb } from './index.js';
|
|
2
|
-
import { PageRepository } from './repositories/PageRepository.js';
|
|
3
|
-
import { EdgeRepository } from './repositories/EdgeRepository.js';
|
|
4
|
-
import { MetricsRepository, DbMetrics } from './repositories/MetricsRepository.js';
|
|
5
|
-
import { SnapshotRepository } from './repositories/SnapshotRepository.js';
|
|
6
|
-
import { Graph } from '../graph/graph.js';
|
|
7
|
-
|
|
8
|
-
/**
 * Rebuilds an in-memory {@link Graph} from the rows stored for one snapshot.
 *
 * Pages become nodes (with their per-page metrics merged in), edges are
 * re-attached by page id, duplicate/content clusters are copied from their
 * tables, and `sessionStats` is derived from the per-page crawl status.
 *
 * @param snapshotId - Id of the snapshot whose rows should be loaded.
 * @returns The populated graph.
 */
export function loadGraphFromSnapshot(snapshotId: number): Graph {
  const db = getDb();
  const pageRepo = new PageRepository(db);
  const edgeRepo = new EdgeRepository(db);
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);

  const pageRows = pageRepo.getPagesIteratorBySnapshot(snapshotId);
  const metricRows = metricsRepo.getMetricsIterator(snapshotId);
  const snapshot = snapshotRepo.getSnapshot(snapshotId);

  // Index metrics by page id so each page can pick up its row in O(1).
  const metricsByPageId = new Map<number, DbMetrics>();
  for (const row of metricRows) {
    metricsByPageId.set(row.page_id, row);
  }

  const graph = new Graph();
  if (snapshot) {
    graph.limitReached = Boolean(snapshot.limit_reached);
  }

  // Crawl statuses that are tallied under `pagesFetched`.
  const countedAsFetched = new Set<string>([
    'fetched',
    'fetched_error',
    'network_error',
    'failed_after_retries',
    'blocked_by_robots'
  ]);
  const stats = { pagesFetched: 0, pagesCached: 0, pagesSkipped: 0 };
  const urlByPageId = new Map<number, string>();

  for (const page of pageRows) {
    const url = page.normalized_url;
    urlByPageId.set(page.id, url);
    graph.addNode(url, page.depth, page.http_status || 0);

    const m = metricsByPageId.get(page.id);
    const status = m?.crawl_status;

    if (m) {
      if (typeof status === 'string' && countedAsFetched.has(status)) {
        stats.pagesFetched += 1;
      } else if (status === 'cached') {
        stats.pagesCached += 1;
      } else if (status === 'skipped') {
        stats.pagesSkipped += 1;
      }
    }

    // A page first seen in this snapshot is "new"; otherwise the crawl
    // status decides between "unchanged" (cached) and "changed" (fetched).
    let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
    if (page.first_seen_snapshot_id === snapshotId) {
      incrementalStatus = 'new';
    } else if (status === 'cached') {
      incrementalStatus = 'unchanged';
    } else if (status === 'fetched') {
      incrementalStatus = 'changed';
    }

    graph.updateNodeData(url, {
      canonical: page.canonical_url || undefined,
      contentHash: page.content_hash || undefined,
      simhash: page.simhash || undefined,
      etag: page.etag || undefined,
      lastModified: page.last_modified || undefined,
      html: page.html || undefined,
      soft404Score: page.soft404_score || undefined,
      noindex: Boolean(page.noindex),
      nofollow: Boolean(page.nofollow),
      incrementalStatus,
      securityError: page.security_error || undefined,
      retries: page.retries || undefined,
      bytesReceived: page.bytes_received || undefined,
      redirectChain: page.redirect_chain ? JSON.parse(page.redirect_chain) : undefined,
      crawlTrapFlag: Boolean(page.crawl_trap_flag),
      crawlTrapRisk: page.crawl_trap_risk || undefined,
      trapType: page.trap_type || undefined,
      // Link-analysis metrics
      pageRank: m?.pagerank ?? undefined,
      pageRankScore: m?.pagerank_score ?? m?.pagerank ?? undefined,
      authorityScore: m?.authority_score ?? undefined,
      hubScore: m?.hub_score ?? undefined,
      linkRole: m?.link_role ?? undefined,
      // Duplicate detection
      duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
      duplicateType: m?.duplicate_type ?? undefined,
      isClusterPrimary: m?.is_cluster_primary ? true : undefined,
      // Content metrics
      crawlStatus: status || undefined,
      wordCount: m?.word_count ?? undefined,
      thinContentScore: m?.thin_content_score ?? undefined,
      externalLinkRatio: m?.external_link_ratio ?? undefined,
      orphanScore: m?.orphan_score ?? undefined,
    });
  }

  // Edges reference pages by id; translate back to URLs and drop any edge
  // whose endpoints are not part of this snapshot's page set.
  const edgeRows = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
  for (const edge of edgeRows) {
    const source = urlByPageId.get(edge.source_page_id);
    const target = urlByPageId.get(edge.target_page_id);
    if (!source || !target) continue;
    graph.addEdge(source, target, edge.weight || 1.0);
  }

  // Duplicate clusters
  const duplicateRows = db
    .prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?')
    .all(snapshotId) as any[];
  graph.duplicateClusters = duplicateRows.map(({ id, type, size, representative, severity }) => ({
    id,
    type,
    size,
    representative,
    severity
  }));

  // Content clusters
  const contentRows = db
    .prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?')
    .all(snapshotId) as any[];
  graph.contentClusters = contentRows.map((row) => ({
    id: row.id,
    count: row.count,
    primaryUrl: row.primary_url,
    risk: row.risk,
    sharedPathPrefix: row.shared_path_prefix || undefined
  }));

  // Session stats
  graph.sessionStats = { ...stats, totalFound: urlByPageId.size };

  return graph;
}
|
package/src/db/index.ts
DELETED
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
import Database from 'better-sqlite3';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import fs from 'node:fs';
|
|
4
|
-
import os from 'node:os';
|
|
5
|
-
import { initSchema } from './schema.js';
|
|
6
|
-
|
|
7
|
-
let dbInstance: Database.Database | null = null;
|
|
8
|
-
|
|
9
|
-
export * from './repositories/SiteRepository.js';
|
|
10
|
-
export * from './repositories/SnapshotRepository.js';
|
|
11
|
-
export { initSchema } from './schema.js';
|
|
12
|
-
|
|
13
|
-
/**
 * Resolves where the SQLite database lives.
 *
 * Resolution order:
 *  1. `NODE_ENV === 'test'` → in-memory database (`':memory:'`).
 *  2. `CRAWLITH_DB_PATH` (when non-empty) → that path, returned as-is.
 *  3. Otherwise `<home>/.crawlith/crawlith.db`; the directory is created
 *     with mode 700 when it does not exist yet.
 */
export function getDbPath(): string {
  const { NODE_ENV, CRAWLITH_DB_PATH } = process.env;

  if (NODE_ENV === 'test') return ':memory:';
  if (CRAWLITH_DB_PATH) return CRAWLITH_DB_PATH;

  const dataDir = path.join(os.homedir(), '.crawlith');
  const dirMissing = !fs.existsSync(dataDir);
  if (dirMissing) {
    fs.mkdirSync(dataDir, { recursive: true });
    // Restrict the freshly created directory to the current user (700).
    fs.chmodSync(dataDir, 0o700);
  }

  return path.join(dataDir, 'crawlith.db');
}
|
|
29
|
-
|
|
30
|
-
/**
 * Returns the process-wide database connection, opening and configuring it
 * on first use.
 *
 * On first call the connection is opened at {@link getDbPath}, tuned via
 * pragmas, restricted to mode 600 on disk (best effort), integrity-checked
 * (a failed check is only logged) and has its schema initialised. If any of
 * the pragma, integrity-check or schema steps throws, the newly opened handle
 * is closed before the error is rethrown, so a failed initialisation does not
 * leak an open connection.
 *
 * @returns The shared `better-sqlite3` database handle.
 * @throws Whatever opening, configuring or `initSchema` throws.
 */
export function getDb(): Database.Database {
  if (dbInstance) {
    return dbInstance;
  }

  const dbPath = getDbPath();
  const db = new Database(dbPath);

  try {
    // Hardening & Performance Configuration
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');
    db.pragma('foreign_keys = ON');
    db.pragma('temp_store = MEMORY');
    db.pragma('mmap_size = 30000000000');
    db.pragma('cache_size = -20000');
    db.pragma('busy_timeout = 5000');

    // Security controls
    // Ensure file permissions are 600 (user read/write only).
    // An in-memory database has no file; skipping it avoids touching an
    // unrelated file that happens to be named ":memory:".
    if (dbPath !== ':memory:') {
      try {
        fs.chmodSync(dbPath, 0o600);
      } catch (_e) {
        // Best effort: failing to tighten permissions must not prevent
        // the database from being used.
      }
    }

    // Integrity check on startup
    const integrity = db.pragma('integrity_check', { simple: true });
    if (integrity !== 'ok') {
      // Reverted to console.warn to avoid breaking change
      console.warn('Database integrity check failed:', integrity);
    }

    // Initialize schema
    initSchema(db);
  } catch (err) {
    // Do not leak the handle when setup fails; the original error wins.
    try {
      db.close();
    } catch (_closeErr) {
      // Ignore secondary failure while closing.
    }
    throw err;
  }

  dbInstance = db;
  return db;
}
|
|
69
|
-
|
|
70
|
-
/**
 * Closes the shared database connection, if one is open, and forgets it so
 * the next `getDb()` call opens a fresh one. Calling it with no open
 * connection is a no-op.
 */
export function closeDb() {
  if (!dbInstance) return;

  dbInstance.close();
  dbInstance = null;
}
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import { Database } from 'better-sqlite3';
|
|
2
|
-
|
|
3
|
-
/** Row shape returned when reading from the `edges` table. */
export interface Edge {
  /** Row identifier. */
  id: number;
  /** Snapshot the edge belongs to. */
  snapshot_id: number;
  /** Id of the page the link originates from. */
  source_page_id: number;
  /** Id of the page the link points to. */
  target_page_id: number;
  /** Weight stored for the link. */
  weight: number;
  /** Relationship label stored for the link. */
  rel: 'nofollow' | 'sponsored' | 'ugc' | 'internal' | 'external' | 'unknown';
}
|
|
11
|
-
|
|
12
|
-
/** Data-access helper for the `edges` table. */
export class EdgeRepository {
  private insertStmt;

  /**
   * @param db - Open database handle; the insert statement is prepared once
   *   here and reused by every insert.
   */
  constructor(private db: Database) {
    this.insertStmt = this.db.prepare(`
      INSERT INTO edges (snapshot_id, source_page_id, target_page_id, weight, rel)
      VALUES (?, ?, ?, ?, ?)
    `);
  }

  /** Inserts one edge; `weight` defaults to 1.0 and `rel` to `'internal'`. */
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight: number = 1.0, rel: string = 'internal') {
    this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
  }

  /** Inserts all given edges inside a single transaction; no-op for an empty list. */
  insertEdges(edges: { snapshot_id: number; source_page_id: number; target_page_id: number; weight: number; rel: string }[]) {
    if (edges.length === 0) return;

    const insertAll = this.db.transaction((batch) => {
      for (const { snapshot_id, source_page_id, target_page_id, weight, rel } of batch) {
        this.insertStmt.run(snapshot_id, source_page_id, target_page_id, weight, rel);
      }
    });
    insertAll(edges);
  }

  /** Loads every edge of a snapshot into an array. */
  getEdgesBySnapshot(snapshotId: number): Edge[] {
    const stmt = this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?');
    return stmt.all(snapshotId) as Edge[];
  }

  /** Iterates over the edges of a snapshot instead of materialising them as an array. */
  getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge> {
    const stmt = this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?');
    return stmt.iterate(snapshotId) as IterableIterator<Edge>;
  }
}
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import { Database } from 'better-sqlite3';
|
|
2
|
-
|
|
3
|
-
/**
 * Row shape of the `metrics` table: one record per page per snapshot.
 * Nullable columns are `null` when the value has not been stored.
 */
export interface DbMetrics {
  /** Snapshot the metrics were computed for. */
  snapshot_id: number;
  /** Page the metrics describe. */
  page_id: number;

  // Link-analysis scores
  authority_score: number | null;
  hub_score: number | null;
  pagerank: number | null;
  pagerank_score: number | null;
  link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;

  /** Crawl outcome label for the page. */
  crawl_status: string | null;

  // Content scores
  word_count: number | null;
  thin_content_score: number | null;
  external_link_ratio: number | null;
  orphan_score: number | null;

  // Duplicate detection
  duplicate_cluster_id: string | null;
  duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
  /** Numeric flag stored as a number rather than a boolean. */
  is_cluster_primary: number;
}
|
|
20
|
-
|
|
21
|
-
/** Data-access helper for the `metrics` table. */
export class MetricsRepository {
  private insertStmt;
  private getByPageStmt;

  /**
   * @param db - Open database handle; the per-page lookup and the upsert
   *   statement are prepared once here and reused.
   */
  constructor(private db: Database) {
    this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
    // Named parameters match the property names of DbMetrics one-to-one.
    this.insertStmt = this.db.prepare(`
      INSERT OR REPLACE INTO metrics (
        snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
        link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
        orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
      ) VALUES (
        @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
        @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
        @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
      )
    `);
  }

  /** Inserts or replaces the metrics row for one page. */
  insertMetrics(metrics: DbMetrics) {
    this.insertStmt.run(metrics);
  }

  /** Loads every metrics row of a snapshot into an array. */
  getMetrics(snapshotId: number): DbMetrics[] {
    const stmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?');
    return stmt.all(snapshotId) as DbMetrics[];
  }

  /** Iterates over the metrics rows of a snapshot instead of materialising them as an array. */
  getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics> {
    const stmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?');
    return stmt.iterate(snapshotId) as IterableIterator<DbMetrics>;
  }

  /** Returns the metrics row for one page of a snapshot, or `undefined` if there is none. */
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
    return this.getByPageStmt.get(snapshotId, pageId) as DbMetrics | undefined;
  }

  /** Inserts or replaces all given rows inside a single transaction. */
  insertMany(metricsList: DbMetrics[]) {
    const upsert = this.insertStmt;
    const writeAll = this.db.transaction((rows: DbMetrics[]) => {
      for (const row of rows) {
        upsert.run(row);
      }
    });
    writeAll(metricsList);
  }
}
|