@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
|
@@ -6,37 +6,71 @@ import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
|
6
6
|
import { computePageRank } from '../graph/pagerank.js';
|
|
7
7
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
8
8
|
import { computeHITS } from '../scoring/hits.js';
|
|
9
|
+
import { EngineContext } from '../events.js';
|
|
10
|
+
import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
|
|
9
11
|
|
|
10
|
-
|
|
12
|
+
import { Graph } from '../graph/graph.js';
|
|
13
|
+
|
|
14
|
+
export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached: boolean = false, graphInstance?: Graph) {
|
|
11
15
|
const db = getDb();
|
|
12
16
|
const metricsRepo = new MetricsRepository(db);
|
|
13
17
|
const snapshotRepo = new SnapshotRepository(db);
|
|
14
18
|
const pageRepo = new PageRepository(db);
|
|
15
19
|
|
|
20
|
+
const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
|
|
21
|
+
|
|
22
|
+
// Fallback emitter
|
|
23
|
+
const emit = (event: any) => {
|
|
24
|
+
if (context) {
|
|
25
|
+
context.emit(event);
|
|
26
|
+
} else {
|
|
27
|
+
if (event.type === 'error') console.error(event.message);
|
|
28
|
+
else if (event.type !== 'debug') console.log(event.message || event.phase);
|
|
29
|
+
}
|
|
30
|
+
};
|
|
31
|
+
|
|
16
32
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
17
33
|
if (!snapshot) {
|
|
18
|
-
|
|
34
|
+
emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
|
|
19
35
|
return;
|
|
20
36
|
}
|
|
21
37
|
|
|
22
|
-
|
|
23
|
-
|
|
38
|
+
if (!graphInstance) {
|
|
39
|
+
emit({ type: 'metrics:start', phase: 'Loading graph' });
|
|
40
|
+
}
|
|
24
41
|
|
|
25
|
-
|
|
42
|
+
emit({ type: 'metrics:start', phase: 'Computing PageRank' });
|
|
26
43
|
computePageRank(graph);
|
|
27
44
|
|
|
28
|
-
|
|
45
|
+
emit({ type: 'metrics:start', phase: 'Computing HITS' });
|
|
29
46
|
computeHITS(graph);
|
|
30
47
|
|
|
31
|
-
|
|
48
|
+
emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
|
|
32
49
|
const nodes = graph.getNodes();
|
|
33
50
|
|
|
51
|
+
// Pre-fetch all page IDs to avoid N+1 queries
|
|
52
|
+
// Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
|
|
53
|
+
const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
54
|
+
const urlToId = new Map<string, number>();
|
|
55
|
+
for (const p of pages) {
|
|
56
|
+
urlToId.set(p.normalized_url, p.id);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const clusterStmt = db.prepare(`
|
|
60
|
+
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
61
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
62
|
+
`);
|
|
63
|
+
|
|
64
|
+
const contentStmt = db.prepare(`
|
|
65
|
+
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
66
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
67
|
+
`);
|
|
68
|
+
|
|
34
69
|
const tx = db.transaction(() => {
|
|
35
70
|
for (const node of nodes) {
|
|
36
|
-
const pageId =
|
|
71
|
+
const pageId = urlToId.get(node.url);
|
|
37
72
|
if (!pageId) continue;
|
|
38
73
|
|
|
39
|
-
const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
|
|
40
74
|
|
|
41
75
|
metricsRepo.insertMetrics({
|
|
42
76
|
snapshot_id: snapshotId,
|
|
@@ -46,11 +80,11 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitR
|
|
|
46
80
|
pagerank: node.pageRank ?? null,
|
|
47
81
|
pagerank_score: node.pageRankScore ?? null,
|
|
48
82
|
link_role: node.linkRole ?? null,
|
|
49
|
-
crawl_status:
|
|
50
|
-
word_count:
|
|
51
|
-
thin_content_score:
|
|
52
|
-
external_link_ratio:
|
|
53
|
-
orphan_score:
|
|
83
|
+
crawl_status: node.crawlStatus ?? null,
|
|
84
|
+
word_count: node.wordCount ?? null,
|
|
85
|
+
thin_content_score: node.thinContentScore ?? null,
|
|
86
|
+
external_link_ratio: node.externalLinkRatio ?? null,
|
|
87
|
+
orphan_score: node.orphanScore ?? null,
|
|
54
88
|
duplicate_cluster_id: node.duplicateClusterId ?? null,
|
|
55
89
|
duplicate_type: node.duplicateType ?? null,
|
|
56
90
|
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
@@ -72,53 +106,32 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitR
|
|
|
72
106
|
}
|
|
73
107
|
|
|
74
108
|
// Save duplicate clusters
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
78
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
79
|
-
`);
|
|
80
|
-
for (const cluster of graph.duplicateClusters) {
|
|
81
|
-
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
82
|
-
}
|
|
109
|
+
for (const cluster of graph.duplicateClusters) {
|
|
110
|
+
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
83
111
|
}
|
|
84
112
|
|
|
85
113
|
// Save content clusters
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
89
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
90
|
-
`);
|
|
91
|
-
for (const cluster of graph.contentClusters) {
|
|
92
|
-
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
93
|
-
}
|
|
114
|
+
for (const cluster of graph.contentClusters) {
|
|
115
|
+
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
94
116
|
}
|
|
95
117
|
});
|
|
96
118
|
tx();
|
|
97
119
|
|
|
98
|
-
|
|
120
|
+
emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
|
|
99
121
|
const metrics = calculateMetrics(graph, maxDepth);
|
|
100
122
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
const score = node.authorityScore || node.pageRankScore || 0;
|
|
105
|
-
const depth = node.depth;
|
|
106
|
-
const weight = 1 / (depth + 1);
|
|
107
|
-
totalScore += score * weight;
|
|
108
|
-
totalWeight += weight;
|
|
109
|
-
}
|
|
110
|
-
const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
|
|
111
|
-
|
|
112
|
-
const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId) as { count: number };
|
|
123
|
+
// Calculate penalty-based health score (matches CLI)
|
|
124
|
+
const issues = collectCrawlIssues(graph, metrics);
|
|
125
|
+
const health = calculateHealthScore(metrics.totalPages, issues);
|
|
113
126
|
|
|
114
127
|
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
115
128
|
node_count: metrics.totalPages,
|
|
116
129
|
edge_count: metrics.totalEdges,
|
|
117
|
-
health_score:
|
|
118
|
-
orphan_count:
|
|
119
|
-
thin_content_count:
|
|
130
|
+
health_score: health.score,
|
|
131
|
+
orphan_count: issues.orphanPages,
|
|
132
|
+
thin_content_count: issues.thinContent,
|
|
120
133
|
limit_reached: limitReached ? 1 : 0
|
|
121
134
|
});
|
|
122
135
|
|
|
123
|
-
|
|
136
|
+
emit({ type: 'metrics:complete', durationMs: 0 });
|
|
124
137
|
}
|
package/src/crawler/sitemap.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import { request } from 'undici';
|
|
2
2
|
import * as cheerio from 'cheerio';
|
|
3
3
|
import { normalizeUrl } from './normalize.js';
|
|
4
|
+
import { EngineContext } from '../events.js';
|
|
4
5
|
|
|
5
6
|
export class Sitemap {
|
|
7
|
+
constructor(private context?: EngineContext) {}
|
|
8
|
+
|
|
6
9
|
/**
|
|
7
10
|
* Fetches and parses a sitemap (or sitemap index) to extract URLs.
|
|
8
11
|
* Recursively handles sitemap indexes with loop detection and depth limits.
|
|
@@ -67,7 +70,7 @@ export class Sitemap {
|
|
|
67
70
|
await res.body.dump();
|
|
68
71
|
}
|
|
69
72
|
} catch (e) {
|
|
70
|
-
|
|
73
|
+
this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
|
|
71
74
|
}
|
|
72
75
|
}
|
|
73
76
|
}
|
package/src/db/graphLoader.ts
CHANGED
|
@@ -12,8 +12,8 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
|
|
|
12
12
|
const metricsRepo = new MetricsRepository(db);
|
|
13
13
|
const snapshotRepo = new SnapshotRepository(db);
|
|
14
14
|
|
|
15
|
-
const pages = pageRepo.
|
|
16
|
-
const metrics = metricsRepo.
|
|
15
|
+
const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
|
|
16
|
+
const metrics = metricsRepo.getMetricsIterator(snapshotId);
|
|
17
17
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
18
18
|
const metricsMap = new Map<number, DbMetrics>();
|
|
19
19
|
for (const m of metrics) {
|
|
@@ -21,6 +21,10 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
const graph = new Graph();
|
|
24
|
+
let pagesFetched = 0;
|
|
25
|
+
let pagesCached = 0;
|
|
26
|
+
let pagesSkipped = 0;
|
|
27
|
+
|
|
24
28
|
if (snapshot) {
|
|
25
29
|
graph.limitReached = !!snapshot.limit_reached;
|
|
26
30
|
}
|
|
@@ -31,6 +35,18 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
|
|
|
31
35
|
graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
|
|
32
36
|
|
|
33
37
|
const m = metricsMap.get(p.id);
|
|
38
|
+
if (m) {
|
|
39
|
+
const isProcessed = m.crawl_status === 'fetched' ||
|
|
40
|
+
m.crawl_status === 'fetched_error' ||
|
|
41
|
+
m.crawl_status === 'network_error' ||
|
|
42
|
+
m.crawl_status === 'failed_after_retries' ||
|
|
43
|
+
m.crawl_status === 'blocked_by_robots';
|
|
44
|
+
|
|
45
|
+
if (isProcessed) pagesFetched++;
|
|
46
|
+
else if (m.crawl_status === 'cached') pagesCached++;
|
|
47
|
+
else if (m.crawl_status === 'skipped') pagesSkipped++;
|
|
48
|
+
}
|
|
49
|
+
|
|
34
50
|
let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
|
|
35
51
|
if (p.first_seen_snapshot_id === snapshotId) {
|
|
36
52
|
incrementalStatus = 'new';
|
|
@@ -68,10 +84,16 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
|
|
|
68
84
|
duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
|
|
69
85
|
duplicateType: m?.duplicate_type ?? undefined,
|
|
70
86
|
isClusterPrimary: m?.is_cluster_primary ? true : undefined,
|
|
87
|
+
// Additional metrics
|
|
88
|
+
crawlStatus: m?.crawl_status || undefined,
|
|
89
|
+
wordCount: m?.word_count != null ? m.word_count : undefined,
|
|
90
|
+
thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
|
|
91
|
+
externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
|
|
92
|
+
orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
|
|
71
93
|
});
|
|
72
94
|
}
|
|
73
95
|
|
|
74
|
-
const edges = edgeRepo.
|
|
96
|
+
const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
|
|
75
97
|
|
|
76
98
|
for (const e of edges) {
|
|
77
99
|
const source = idMap.get(e.source_page_id);
|
|
@@ -101,5 +123,13 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
|
|
|
101
123
|
sharedPathPrefix: c.shared_path_prefix || undefined
|
|
102
124
|
}));
|
|
103
125
|
|
|
126
|
+
// Set session stats
|
|
127
|
+
graph.sessionStats = {
|
|
128
|
+
pagesFetched,
|
|
129
|
+
pagesCached,
|
|
130
|
+
pagesSkipped,
|
|
131
|
+
totalFound: idMap.size
|
|
132
|
+
};
|
|
133
|
+
|
|
104
134
|
return graph;
|
|
105
135
|
}
|
package/src/db/index.ts
CHANGED
|
@@ -6,6 +6,10 @@ import { initSchema } from './schema.js';
|
|
|
6
6
|
|
|
7
7
|
let dbInstance: Database.Database | null = null;
|
|
8
8
|
|
|
9
|
+
export * from './repositories/SiteRepository.js';
|
|
10
|
+
export * from './repositories/SnapshotRepository.js';
|
|
11
|
+
export { initSchema } from './schema.js';
|
|
12
|
+
|
|
9
13
|
export function getDbPath(): string {
|
|
10
14
|
if (process.env.NODE_ENV === 'test') {
|
|
11
15
|
return ':memory:';
|
|
@@ -52,6 +56,7 @@ export function getDb(): Database.Database {
|
|
|
52
56
|
// Integrity check on startup
|
|
53
57
|
const integrity = db.pragma('integrity_check', { simple: true });
|
|
54
58
|
if (integrity !== 'ok') {
|
|
59
|
+
// Reverted to console.warn to avoid breaking change
|
|
55
60
|
console.warn('Database integrity check failed:', integrity);
|
|
56
61
|
}
|
|
57
62
|
|
|
@@ -23,7 +23,21 @@ export class EdgeRepository {
|
|
|
23
23
|
this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
+
insertEdges(edges: { snapshot_id: number; source_page_id: number; target_page_id: number; weight: number; rel: string }[]) {
|
|
27
|
+
if (edges.length === 0) return;
|
|
28
|
+
const tx = this.db.transaction((edgesBatch) => {
|
|
29
|
+
for (const edge of edgesBatch) {
|
|
30
|
+
this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
|
|
31
|
+
}
|
|
32
|
+
});
|
|
33
|
+
tx(edges);
|
|
34
|
+
}
|
|
35
|
+
|
|
26
36
|
getEdgesBySnapshot(snapshotId: number): Edge[] {
|
|
27
37
|
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId) as Edge[];
|
|
28
38
|
}
|
|
39
|
+
|
|
40
|
+
getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge> {
|
|
41
|
+
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId) as IterableIterator<Edge>;
|
|
42
|
+
}
|
|
29
43
|
}
|
|
@@ -20,8 +20,10 @@ export interface DbMetrics {
|
|
|
20
20
|
|
|
21
21
|
export class MetricsRepository {
|
|
22
22
|
private insertStmt;
|
|
23
|
+
private getByPageStmt;
|
|
23
24
|
|
|
24
25
|
constructor(private db: Database) {
|
|
26
|
+
this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
|
|
25
27
|
this.insertStmt = this.db.prepare(`
|
|
26
28
|
INSERT OR REPLACE INTO metrics (
|
|
27
29
|
snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
|
|
@@ -43,7 +45,19 @@ export class MetricsRepository {
|
|
|
43
45
|
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId) as DbMetrics[];
|
|
44
46
|
}
|
|
45
47
|
|
|
48
|
+
getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics> {
|
|
49
|
+
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId) as IterableIterator<DbMetrics>;
|
|
50
|
+
}
|
|
51
|
+
|
|
46
52
|
getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
|
|
47
|
-
return this.
|
|
53
|
+
return this.getByPageStmt.get(snapshotId, pageId) as DbMetrics | undefined;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
insertMany(metricsList: DbMetrics[]) {
|
|
57
|
+
const insert = this.insertStmt;
|
|
58
|
+
const tx = this.db.transaction((items: DbMetrics[]) => {
|
|
59
|
+
for (const item of items) insert.run(item);
|
|
60
|
+
});
|
|
61
|
+
tx(metricsList);
|
|
48
62
|
}
|
|
49
63
|
}
|
|
@@ -49,24 +49,24 @@ export class PageRepository {
|
|
|
49
49
|
)
|
|
50
50
|
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
51
51
|
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
52
|
-
http_status = excluded.http_status,
|
|
53
|
-
canonical_url = excluded.canonical_url,
|
|
54
|
-
content_hash = excluded.content_hash,
|
|
55
|
-
simhash = excluded.simhash,
|
|
56
|
-
etag = excluded.etag,
|
|
57
|
-
last_modified = excluded.last_modified,
|
|
58
|
-
html = excluded.html,
|
|
59
|
-
soft404_score = excluded.soft404_score,
|
|
60
|
-
noindex = excluded.noindex,
|
|
61
|
-
nofollow = excluded.nofollow,
|
|
62
|
-
security_error = excluded.security_error,
|
|
63
|
-
retries = excluded.retries,
|
|
64
|
-
depth = excluded.depth,
|
|
65
|
-
redirect_chain = excluded.redirect_chain,
|
|
66
|
-
bytes_received = excluded.bytes_received,
|
|
67
|
-
crawl_trap_flag = excluded.crawl_trap_flag,
|
|
68
|
-
crawl_trap_risk = excluded.crawl_trap_risk,
|
|
69
|
-
trap_type = excluded.trap_type,
|
|
52
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
53
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
54
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
55
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
56
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
57
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
58
|
+
html = COALESCE(excluded.html, pages.html),
|
|
59
|
+
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
60
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
61
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
62
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
63
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
64
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
65
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
66
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
67
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
68
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
69
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
70
70
|
updated_at = datetime('now')
|
|
71
71
|
`);
|
|
72
72
|
|
|
@@ -117,8 +117,108 @@ export class PageRepository {
|
|
|
117
117
|
return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url) as Page | undefined;
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
+
getPagesByUrls(siteId: number, urls: string[]): Page[] {
|
|
121
|
+
if (urls.length === 0) return [];
|
|
122
|
+
const chunkSize = 900;
|
|
123
|
+
const results: Page[] = [];
|
|
124
|
+
|
|
125
|
+
for (let i = 0; i < urls.length; i += chunkSize) {
|
|
126
|
+
const chunk = urls.slice(i, i + chunkSize);
|
|
127
|
+
const placeholders = chunk.map(() => '?').join(',');
|
|
128
|
+
const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk) as Page[];
|
|
129
|
+
results.push(...chunkResults);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
return results;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
upsertMany(pages: (Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number })[]): Map<string, number> {
|
|
136
|
+
if (pages.length === 0) return new Map();
|
|
137
|
+
|
|
138
|
+
const upsertStmtWithReturn = this.db.prepare(`
|
|
139
|
+
INSERT INTO pages (
|
|
140
|
+
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
141
|
+
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
142
|
+
soft404_score, noindex, nofollow, security_error, retries, depth,
|
|
143
|
+
redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
144
|
+
updated_at
|
|
145
|
+
) VALUES (
|
|
146
|
+
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
147
|
+
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
148
|
+
@soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
|
|
149
|
+
@redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
150
|
+
datetime('now')
|
|
151
|
+
)
|
|
152
|
+
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
153
|
+
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
154
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
155
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
156
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
157
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
158
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
159
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
160
|
+
html = COALESCE(excluded.html, pages.html),
|
|
161
|
+
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
162
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
163
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
164
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
165
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
166
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
167
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
168
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
169
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
170
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
171
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
172
|
+
updated_at = datetime('now')
|
|
173
|
+
RETURNING id
|
|
174
|
+
`);
|
|
175
|
+
|
|
176
|
+
const urlToId = new Map<string, number>();
|
|
177
|
+
const tx = this.db.transaction((pagesBatch) => {
|
|
178
|
+
for (const page of pagesBatch) {
|
|
179
|
+
const params = {
|
|
180
|
+
site_id: page.site_id,
|
|
181
|
+
normalized_url: page.normalized_url,
|
|
182
|
+
first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
|
|
183
|
+
last_seen_snapshot_id: page.last_seen_snapshot_id,
|
|
184
|
+
http_status: page.http_status ?? null,
|
|
185
|
+
canonical_url: page.canonical_url ?? null,
|
|
186
|
+
content_hash: page.content_hash ?? null,
|
|
187
|
+
simhash: page.simhash ?? null,
|
|
188
|
+
etag: page.etag ?? null,
|
|
189
|
+
last_modified: page.last_modified ?? null,
|
|
190
|
+
html: page.html ?? null,
|
|
191
|
+
soft404_score: page.soft404_score ?? null,
|
|
192
|
+
noindex: page.noindex ?? 0,
|
|
193
|
+
nofollow: page.nofollow ?? 0,
|
|
194
|
+
security_error: page.security_error ?? null,
|
|
195
|
+
retries: page.retries ?? 0,
|
|
196
|
+
depth: page.depth ?? 0,
|
|
197
|
+
redirect_chain: page.redirect_chain ?? null,
|
|
198
|
+
bytes_received: page.bytes_received ?? null,
|
|
199
|
+
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
200
|
+
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
201
|
+
trap_type: page.trap_type ?? null,
|
|
202
|
+
};
|
|
203
|
+
const row = upsertStmtWithReturn.get(params) as { id: number };
|
|
204
|
+
urlToId.set(page.normalized_url, row.id);
|
|
205
|
+
}
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
tx(pages);
|
|
209
|
+
return urlToId;
|
|
210
|
+
}
|
|
211
|
+
|
|
120
212
|
getPagesBySnapshot(snapshotId: number): Page[] {
|
|
121
|
-
return this.db.prepare('SELECT
|
|
213
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId) as Page[];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
getPagesIdentityBySnapshot(snapshotId: number): { id: number; normalized_url: string }[] {
|
|
217
|
+
return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId) as { id: number; normalized_url: string }[];
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page> {
|
|
221
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').iterate(snapshotId, snapshotId) as IterableIterator<Page>;
|
|
122
222
|
}
|
|
123
223
|
|
|
124
224
|
getIdByUrl(siteId: number, url: string): number | undefined {
|
|
@@ -11,10 +11,18 @@ export interface Site {
|
|
|
11
11
|
export class SiteRepository {
|
|
12
12
|
constructor(private db: Database) { }
|
|
13
13
|
|
|
14
|
+
getSiteById(id: number): Site | undefined {
|
|
15
|
+
return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id) as Site | undefined;
|
|
16
|
+
}
|
|
17
|
+
|
|
14
18
|
getSite(domain: string): Site | undefined {
|
|
15
19
|
return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain) as Site | undefined;
|
|
16
20
|
}
|
|
17
21
|
|
|
22
|
+
getAllSites(): Site[] {
|
|
23
|
+
return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all() as Site[];
|
|
24
|
+
}
|
|
25
|
+
|
|
18
26
|
createSite(domain: string): number {
|
|
19
27
|
const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
|
|
20
28
|
const info = stmt.run(domain);
|
|
@@ -29,4 +37,7 @@ export class SiteRepository {
|
|
|
29
37
|
}
|
|
30
38
|
return site!;
|
|
31
39
|
}
|
|
40
|
+
deleteSite(id: number): void {
|
|
41
|
+
this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
|
|
42
|
+
}
|
|
32
43
|
}
|
|
@@ -15,25 +15,35 @@ export interface Snapshot {
|
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
export class SnapshotRepository {
|
|
18
|
-
constructor(private db: Database) {}
|
|
18
|
+
constructor(private db: Database) { }
|
|
19
19
|
|
|
20
20
|
createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status: 'running' | 'completed' | 'failed' = 'running'): number {
|
|
21
|
+
// Basic throttling or sleep if needed for tests, but generally SQLite is fast enough to have diff timestamps if not in same ms.
|
|
22
|
+
// However, if we run in memory, created_at is default current time.
|
|
23
|
+
// If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
|
|
24
|
+
// We should rely on ID for stability if timestamps are equal, but the query uses created_at.
|
|
25
|
+
// Let's ensure we can also order by ID as tie-breaker.
|
|
21
26
|
const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
|
|
22
27
|
const info = stmt.run(siteId, type, status);
|
|
23
28
|
return info.lastInsertRowid as number;
|
|
24
29
|
}
|
|
25
30
|
|
|
26
31
|
getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined {
|
|
27
|
-
let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
|
|
32
|
+
let sql = 'SELECT * FROM snapshots WHERE site_id = ? AND type != \'partial\'';
|
|
28
33
|
const params: any[] = [siteId];
|
|
29
34
|
if (status) {
|
|
30
35
|
sql += ' AND status = ?';
|
|
31
36
|
params.push(status);
|
|
32
37
|
}
|
|
33
|
-
sql += ' ORDER BY created_at DESC LIMIT 1';
|
|
38
|
+
sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
|
|
34
39
|
return this.db.prepare(sql).get(...params) as Snapshot | undefined;
|
|
35
40
|
}
|
|
36
41
|
|
|
42
|
+
getSnapshotCount(siteId: number): number {
|
|
43
|
+
const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId) as { count: number };
|
|
44
|
+
return result.count;
|
|
45
|
+
}
|
|
46
|
+
|
|
37
47
|
updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats: Partial<Snapshot> = {}) {
|
|
38
48
|
const sets: string[] = ['status = ?'];
|
|
39
49
|
const params: any[] = [status];
|
|
@@ -71,4 +81,19 @@ export class SnapshotRepository {
|
|
|
71
81
|
getSnapshot(id: number): Snapshot | undefined {
|
|
72
82
|
return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id) as Snapshot | undefined;
|
|
73
83
|
}
|
|
84
|
+
|
|
85
|
+
deleteSnapshot(id: number): void {
|
|
86
|
+
const tx = this.db.transaction(() => {
|
|
87
|
+
// Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
|
|
88
|
+
this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
|
|
89
|
+
this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
|
|
90
|
+
|
|
91
|
+
// Cleanup: Delete pages that are no longer referenced by any snapshot
|
|
92
|
+
this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
|
|
93
|
+
|
|
94
|
+
// Delete the snapshot
|
|
95
|
+
this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
|
|
96
|
+
});
|
|
97
|
+
tx();
|
|
98
|
+
}
|
|
74
99
|
}
|
package/src/events.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
export type CrawlEvent =
|
|
2
|
+
| { type: 'crawl:start'; url: string }
|
|
3
|
+
| { type: 'crawl:success'; url: string; status: number; durationMs: number; depth?: number }
|
|
4
|
+
| { type: 'crawl:error'; url: string; error: string; depth?: number }
|
|
5
|
+
| { type: 'crawl:limit-reached'; limit: number }
|
|
6
|
+
| { type: 'queue:enqueue'; url: string; depth: number }
|
|
7
|
+
| { type: 'metrics:start'; phase: string }
|
|
8
|
+
| { type: 'metrics:complete'; durationMs: number }
|
|
9
|
+
| { type: 'debug'; message: string; context?: unknown }
|
|
10
|
+
| { type: 'info'; message: string; context?: unknown }
|
|
11
|
+
| { type: 'warn'; message: string; context?: unknown }
|
|
12
|
+
| { type: 'error'; message: string; error?: unknown; context?: unknown };
|
|
13
|
+
|
|
14
|
+
export interface EngineContext {
|
|
15
|
+
emit: (event: CrawlEvent) => void;
|
|
16
|
+
}
|