@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
|
@@ -6,30 +6,59 @@ import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
|
6
6
|
import { computePageRank } from '../graph/pagerank.js';
|
|
7
7
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
8
8
|
import { computeHITS } from '../scoring/hits.js';
|
|
9
|
-
|
|
9
|
+
import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
|
|
10
|
+
export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached = false, graphInstance) {
|
|
10
11
|
const db = getDb();
|
|
11
12
|
const metricsRepo = new MetricsRepository(db);
|
|
12
13
|
const snapshotRepo = new SnapshotRepository(db);
|
|
13
14
|
const pageRepo = new PageRepository(db);
|
|
15
|
+
const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
|
|
16
|
+
// Fallback emitter
|
|
17
|
+
const emit = (event) => {
|
|
18
|
+
if (context) {
|
|
19
|
+
context.emit(event);
|
|
20
|
+
}
|
|
21
|
+
else {
|
|
22
|
+
if (event.type === 'error')
|
|
23
|
+
console.error(event.message);
|
|
24
|
+
else if (event.type !== 'debug')
|
|
25
|
+
console.log(event.message || event.phase);
|
|
26
|
+
}
|
|
27
|
+
};
|
|
14
28
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
15
29
|
if (!snapshot) {
|
|
16
|
-
|
|
30
|
+
emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
|
|
17
31
|
return;
|
|
18
32
|
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
33
|
+
if (!graphInstance) {
|
|
34
|
+
emit({ type: 'metrics:start', phase: 'Loading graph' });
|
|
35
|
+
}
|
|
36
|
+
emit({ type: 'metrics:start', phase: 'Computing PageRank' });
|
|
22
37
|
computePageRank(graph);
|
|
23
|
-
|
|
38
|
+
emit({ type: 'metrics:start', phase: 'Computing HITS' });
|
|
24
39
|
computeHITS(graph);
|
|
25
|
-
|
|
40
|
+
emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
|
|
26
41
|
const nodes = graph.getNodes();
|
|
42
|
+
// Pre-fetch all page IDs to avoid N+1 queries
|
|
43
|
+
// Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
|
|
44
|
+
const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
45
|
+
const urlToId = new Map();
|
|
46
|
+
for (const p of pages) {
|
|
47
|
+
urlToId.set(p.normalized_url, p.id);
|
|
48
|
+
}
|
|
49
|
+
const clusterStmt = db.prepare(`
|
|
50
|
+
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
51
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
52
|
+
`);
|
|
53
|
+
const contentStmt = db.prepare(`
|
|
54
|
+
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
55
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
56
|
+
`);
|
|
27
57
|
const tx = db.transaction(() => {
|
|
28
58
|
for (const node of nodes) {
|
|
29
|
-
const pageId =
|
|
59
|
+
const pageId = urlToId.get(node.url);
|
|
30
60
|
if (!pageId)
|
|
31
61
|
continue;
|
|
32
|
-
const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
|
|
33
62
|
metricsRepo.insertMetrics({
|
|
34
63
|
snapshot_id: snapshotId,
|
|
35
64
|
page_id: pageId,
|
|
@@ -38,11 +67,11 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
|
|
|
38
67
|
pagerank: node.pageRank ?? null,
|
|
39
68
|
pagerank_score: node.pageRankScore ?? null,
|
|
40
69
|
link_role: node.linkRole ?? null,
|
|
41
|
-
crawl_status:
|
|
42
|
-
word_count:
|
|
43
|
-
thin_content_score:
|
|
44
|
-
external_link_ratio:
|
|
45
|
-
orphan_score:
|
|
70
|
+
crawl_status: node.crawlStatus ?? null,
|
|
71
|
+
word_count: node.wordCount ?? null,
|
|
72
|
+
thin_content_score: node.thinContentScore ?? null,
|
|
73
|
+
external_link_ratio: node.externalLinkRatio ?? null,
|
|
74
|
+
orphan_score: node.orphanScore ?? null,
|
|
46
75
|
duplicate_cluster_id: node.duplicateClusterId ?? null,
|
|
47
76
|
duplicate_type: node.duplicateType ?? null,
|
|
48
77
|
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
@@ -62,47 +91,27 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
|
|
|
62
91
|
}
|
|
63
92
|
}
|
|
64
93
|
// Save duplicate clusters
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
68
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
69
|
-
`);
|
|
70
|
-
for (const cluster of graph.duplicateClusters) {
|
|
71
|
-
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
72
|
-
}
|
|
94
|
+
for (const cluster of graph.duplicateClusters) {
|
|
95
|
+
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
73
96
|
}
|
|
74
97
|
// Save content clusters
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
78
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
79
|
-
`);
|
|
80
|
-
for (const cluster of graph.contentClusters) {
|
|
81
|
-
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
82
|
-
}
|
|
98
|
+
for (const cluster of graph.contentClusters) {
|
|
99
|
+
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
83
100
|
}
|
|
84
101
|
});
|
|
85
102
|
tx();
|
|
86
|
-
|
|
103
|
+
emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
|
|
87
104
|
const metrics = calculateMetrics(graph, maxDepth);
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
const score = node.authorityScore || node.pageRankScore || 0;
|
|
92
|
-
const depth = node.depth;
|
|
93
|
-
const weight = 1 / (depth + 1);
|
|
94
|
-
totalScore += score * weight;
|
|
95
|
-
totalWeight += weight;
|
|
96
|
-
}
|
|
97
|
-
const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
|
|
98
|
-
const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId);
|
|
105
|
+
// Calculate penalty-based health score (matches CLI)
|
|
106
|
+
const issues = collectCrawlIssues(graph, metrics);
|
|
107
|
+
const health = calculateHealthScore(metrics.totalPages, issues);
|
|
99
108
|
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
100
109
|
node_count: metrics.totalPages,
|
|
101
110
|
edge_count: metrics.totalEdges,
|
|
102
|
-
health_score:
|
|
103
|
-
orphan_count:
|
|
104
|
-
thin_content_count:
|
|
111
|
+
health_score: health.score,
|
|
112
|
+
orphan_count: issues.orphanPages,
|
|
113
|
+
thin_content_count: issues.thinContent,
|
|
105
114
|
limit_reached: limitReached ? 1 : 0
|
|
106
115
|
});
|
|
107
|
-
|
|
116
|
+
emit({ type: 'metrics:complete', durationMs: 0 });
|
|
108
117
|
}
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
+
import { EngineContext } from '../events.js';
|
|
1
2
|
export declare class Sitemap {
|
|
3
|
+
private context?;
|
|
4
|
+
constructor(context?: EngineContext | undefined);
|
|
2
5
|
/**
|
|
3
6
|
* Fetches and parses a sitemap (or sitemap index) to extract URLs.
|
|
4
7
|
* Recursively handles sitemap indexes with loop detection and depth limits.
|
package/dist/crawler/sitemap.js
CHANGED
|
@@ -2,6 +2,10 @@ import { request } from 'undici';
|
|
|
2
2
|
import * as cheerio from 'cheerio';
|
|
3
3
|
import { normalizeUrl } from './normalize.js';
|
|
4
4
|
export class Sitemap {
|
|
5
|
+
context;
|
|
6
|
+
constructor(context) {
|
|
7
|
+
this.context = context;
|
|
8
|
+
}
|
|
5
9
|
/**
|
|
6
10
|
* Fetches and parses a sitemap (or sitemap index) to extract URLs.
|
|
7
11
|
* Recursively handles sitemap indexes with loop detection and depth limits.
|
|
@@ -64,7 +68,7 @@ export class Sitemap {
|
|
|
64
68
|
}
|
|
65
69
|
}
|
|
66
70
|
catch (e) {
|
|
67
|
-
|
|
71
|
+
this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
|
|
68
72
|
}
|
|
69
73
|
}
|
|
70
74
|
}
|
package/dist/db/graphLoader.js
CHANGED
|
@@ -10,14 +10,17 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
10
10
|
const edgeRepo = new EdgeRepository(db);
|
|
11
11
|
const metricsRepo = new MetricsRepository(db);
|
|
12
12
|
const snapshotRepo = new SnapshotRepository(db);
|
|
13
|
-
const pages = pageRepo.
|
|
14
|
-
const metrics = metricsRepo.
|
|
13
|
+
const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
|
|
14
|
+
const metrics = metricsRepo.getMetricsIterator(snapshotId);
|
|
15
15
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
16
16
|
const metricsMap = new Map();
|
|
17
17
|
for (const m of metrics) {
|
|
18
18
|
metricsMap.set(m.page_id, m);
|
|
19
19
|
}
|
|
20
20
|
const graph = new Graph();
|
|
21
|
+
let pagesFetched = 0;
|
|
22
|
+
let pagesCached = 0;
|
|
23
|
+
let pagesSkipped = 0;
|
|
21
24
|
if (snapshot) {
|
|
22
25
|
graph.limitReached = !!snapshot.limit_reached;
|
|
23
26
|
}
|
|
@@ -26,6 +29,19 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
26
29
|
idMap.set(p.id, p.normalized_url);
|
|
27
30
|
graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
|
|
28
31
|
const m = metricsMap.get(p.id);
|
|
32
|
+
if (m) {
|
|
33
|
+
const isProcessed = m.crawl_status === 'fetched' ||
|
|
34
|
+
m.crawl_status === 'fetched_error' ||
|
|
35
|
+
m.crawl_status === 'network_error' ||
|
|
36
|
+
m.crawl_status === 'failed_after_retries' ||
|
|
37
|
+
m.crawl_status === 'blocked_by_robots';
|
|
38
|
+
if (isProcessed)
|
|
39
|
+
pagesFetched++;
|
|
40
|
+
else if (m.crawl_status === 'cached')
|
|
41
|
+
pagesCached++;
|
|
42
|
+
else if (m.crawl_status === 'skipped')
|
|
43
|
+
pagesSkipped++;
|
|
44
|
+
}
|
|
29
45
|
let incrementalStatus;
|
|
30
46
|
if (p.first_seen_snapshot_id === snapshotId) {
|
|
31
47
|
incrementalStatus = 'new';
|
|
@@ -64,9 +80,15 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
64
80
|
duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
|
|
65
81
|
duplicateType: m?.duplicate_type ?? undefined,
|
|
66
82
|
isClusterPrimary: m?.is_cluster_primary ? true : undefined,
|
|
83
|
+
// Additional metrics
|
|
84
|
+
crawlStatus: m?.crawl_status || undefined,
|
|
85
|
+
wordCount: m?.word_count != null ? m.word_count : undefined,
|
|
86
|
+
thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
|
|
87
|
+
externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
|
|
88
|
+
orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
|
|
67
89
|
});
|
|
68
90
|
}
|
|
69
|
-
const edges = edgeRepo.
|
|
91
|
+
const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
|
|
70
92
|
for (const e of edges) {
|
|
71
93
|
const source = idMap.get(e.source_page_id);
|
|
72
94
|
const target = idMap.get(e.target_page_id);
|
|
@@ -92,5 +114,12 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
92
114
|
risk: c.risk,
|
|
93
115
|
sharedPathPrefix: c.shared_path_prefix || undefined
|
|
94
116
|
}));
|
|
117
|
+
// Set session stats
|
|
118
|
+
graph.sessionStats = {
|
|
119
|
+
pagesFetched,
|
|
120
|
+
pagesCached,
|
|
121
|
+
pagesSkipped,
|
|
122
|
+
totalFound: idMap.size
|
|
123
|
+
};
|
|
95
124
|
return graph;
|
|
96
125
|
}
|
package/dist/db/index.d.ts
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
import Database from 'better-sqlite3';
|
|
2
|
+
export * from './repositories/SiteRepository.js';
|
|
3
|
+
export * from './repositories/SnapshotRepository.js';
|
|
4
|
+
export { initSchema } from './schema.js';
|
|
2
5
|
export declare function getDbPath(): string;
|
|
3
6
|
export declare function getDb(): Database.Database;
|
|
4
7
|
export declare function closeDb(): void;
|
package/dist/db/index.js
CHANGED
|
@@ -4,6 +4,9 @@ import fs from 'node:fs';
|
|
|
4
4
|
import os from 'node:os';
|
|
5
5
|
import { initSchema } from './schema.js';
|
|
6
6
|
let dbInstance = null;
|
|
7
|
+
export * from './repositories/SiteRepository.js';
|
|
8
|
+
export * from './repositories/SnapshotRepository.js';
|
|
9
|
+
export { initSchema } from './schema.js';
|
|
7
10
|
export function getDbPath() {
|
|
8
11
|
if (process.env.NODE_ENV === 'test') {
|
|
9
12
|
return ':memory:';
|
|
@@ -46,6 +49,7 @@ export function getDb() {
|
|
|
46
49
|
// Integrity check on startup
|
|
47
50
|
const integrity = db.pragma('integrity_check', { simple: true });
|
|
48
51
|
if (integrity !== 'ok') {
|
|
52
|
+
// Reverted to console.warn to avoid breaking change
|
|
49
53
|
console.warn('Database integrity check failed:', integrity);
|
|
50
54
|
}
|
|
51
55
|
// Initialize schema
|
|
@@ -12,5 +12,13 @@ export declare class EdgeRepository {
|
|
|
12
12
|
private insertStmt;
|
|
13
13
|
constructor(db: Database);
|
|
14
14
|
insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight?: number, rel?: string): void;
|
|
15
|
+
insertEdges(edges: {
|
|
16
|
+
snapshot_id: number;
|
|
17
|
+
source_page_id: number;
|
|
18
|
+
target_page_id: number;
|
|
19
|
+
weight: number;
|
|
20
|
+
rel: string;
|
|
21
|
+
}[]): void;
|
|
15
22
|
getEdgesBySnapshot(snapshotId: number): Edge[];
|
|
23
|
+
getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
|
|
16
24
|
}
|
|
@@ -11,7 +11,20 @@ export class EdgeRepository {
|
|
|
11
11
|
insertEdge(snapshotId, sourcePageId, targetPageId, weight = 1.0, rel = 'internal') {
|
|
12
12
|
this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
|
|
13
13
|
}
|
|
14
|
+
insertEdges(edges) {
|
|
15
|
+
if (edges.length === 0)
|
|
16
|
+
return;
|
|
17
|
+
const tx = this.db.transaction((edgesBatch) => {
|
|
18
|
+
for (const edge of edgesBatch) {
|
|
19
|
+
this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
|
|
20
|
+
}
|
|
21
|
+
});
|
|
22
|
+
tx(edges);
|
|
23
|
+
}
|
|
14
24
|
getEdgesBySnapshot(snapshotId) {
|
|
15
25
|
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
|
|
16
26
|
}
|
|
27
|
+
getEdgesIteratorBySnapshot(snapshotId) {
|
|
28
|
+
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId);
|
|
29
|
+
}
|
|
17
30
|
}
|
|
@@ -19,8 +19,11 @@ export interface DbMetrics {
|
|
|
19
19
|
export declare class MetricsRepository {
|
|
20
20
|
private db;
|
|
21
21
|
private insertStmt;
|
|
22
|
+
private getByPageStmt;
|
|
22
23
|
constructor(db: Database);
|
|
23
24
|
insertMetrics(metrics: DbMetrics): void;
|
|
24
25
|
getMetrics(snapshotId: number): DbMetrics[];
|
|
26
|
+
getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics>;
|
|
25
27
|
getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined;
|
|
28
|
+
insertMany(metricsList: DbMetrics[]): void;
|
|
26
29
|
}
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
export class MetricsRepository {
|
|
2
2
|
db;
|
|
3
3
|
insertStmt;
|
|
4
|
+
getByPageStmt;
|
|
4
5
|
constructor(db) {
|
|
5
6
|
this.db = db;
|
|
7
|
+
this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
|
|
6
8
|
this.insertStmt = this.db.prepare(`
|
|
7
9
|
INSERT OR REPLACE INTO metrics (
|
|
8
10
|
snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
|
|
@@ -21,7 +23,18 @@ export class MetricsRepository {
|
|
|
21
23
|
getMetrics(snapshotId) {
|
|
22
24
|
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId);
|
|
23
25
|
}
|
|
26
|
+
getMetricsIterator(snapshotId) {
|
|
27
|
+
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId);
|
|
28
|
+
}
|
|
24
29
|
getMetricsForPage(snapshotId, pageId) {
|
|
25
|
-
return this.
|
|
30
|
+
return this.getByPageStmt.get(snapshotId, pageId);
|
|
31
|
+
}
|
|
32
|
+
insertMany(metricsList) {
|
|
33
|
+
const insert = this.insertStmt;
|
|
34
|
+
const tx = this.db.transaction((items) => {
|
|
35
|
+
for (const item of items)
|
|
36
|
+
insert.run(item);
|
|
37
|
+
});
|
|
38
|
+
tx(metricsList);
|
|
26
39
|
}
|
|
27
40
|
}
|
|
@@ -42,6 +42,17 @@ export declare class PageRepository {
|
|
|
42
42
|
last_seen_snapshot_id: number;
|
|
43
43
|
}): number;
|
|
44
44
|
getPage(siteId: number, url: string): Page | undefined;
|
|
45
|
+
getPagesByUrls(siteId: number, urls: string[]): Page[];
|
|
46
|
+
upsertMany(pages: (Partial<Page> & {
|
|
47
|
+
site_id: number;
|
|
48
|
+
normalized_url: string;
|
|
49
|
+
last_seen_snapshot_id: number;
|
|
50
|
+
})[]): Map<string, number>;
|
|
45
51
|
getPagesBySnapshot(snapshotId: number): Page[];
|
|
52
|
+
getPagesIdentityBySnapshot(snapshotId: number): {
|
|
53
|
+
id: number;
|
|
54
|
+
normalized_url: string;
|
|
55
|
+
}[];
|
|
56
|
+
getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page>;
|
|
46
57
|
getIdByUrl(siteId: number, url: string): number | undefined;
|
|
47
58
|
}
|
|
@@ -20,24 +20,24 @@ export class PageRepository {
|
|
|
20
20
|
)
|
|
21
21
|
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
22
22
|
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
23
|
-
http_status = excluded.http_status,
|
|
24
|
-
canonical_url = excluded.canonical_url,
|
|
25
|
-
content_hash = excluded.content_hash,
|
|
26
|
-
simhash = excluded.simhash,
|
|
27
|
-
etag = excluded.etag,
|
|
28
|
-
last_modified = excluded.last_modified,
|
|
29
|
-
html = excluded.html,
|
|
30
|
-
soft404_score = excluded.soft404_score,
|
|
31
|
-
noindex = excluded.noindex,
|
|
32
|
-
nofollow = excluded.nofollow,
|
|
33
|
-
security_error = excluded.security_error,
|
|
34
|
-
retries = excluded.retries,
|
|
35
|
-
depth = excluded.depth,
|
|
36
|
-
redirect_chain = excluded.redirect_chain,
|
|
37
|
-
bytes_received = excluded.bytes_received,
|
|
38
|
-
crawl_trap_flag = excluded.crawl_trap_flag,
|
|
39
|
-
crawl_trap_risk = excluded.crawl_trap_risk,
|
|
40
|
-
trap_type = excluded.trap_type,
|
|
23
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
24
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
25
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
26
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
27
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
28
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
29
|
+
html = COALESCE(excluded.html, pages.html),
|
|
30
|
+
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
31
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
32
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
33
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
34
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
35
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
36
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
37
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
38
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
39
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
40
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
41
41
|
updated_at = datetime('now')
|
|
42
42
|
`);
|
|
43
43
|
this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
|
|
@@ -83,8 +83,101 @@ export class PageRepository {
|
|
|
83
83
|
getPage(siteId, url) {
|
|
84
84
|
return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url);
|
|
85
85
|
}
|
|
86
|
+
getPagesByUrls(siteId, urls) {
|
|
87
|
+
if (urls.length === 0)
|
|
88
|
+
return [];
|
|
89
|
+
const chunkSize = 900;
|
|
90
|
+
const results = [];
|
|
91
|
+
for (let i = 0; i < urls.length; i += chunkSize) {
|
|
92
|
+
const chunk = urls.slice(i, i + chunkSize);
|
|
93
|
+
const placeholders = chunk.map(() => '?').join(',');
|
|
94
|
+
const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk);
|
|
95
|
+
results.push(...chunkResults);
|
|
96
|
+
}
|
|
97
|
+
return results;
|
|
98
|
+
}
|
|
99
|
+
upsertMany(pages) {
|
|
100
|
+
if (pages.length === 0)
|
|
101
|
+
return new Map();
|
|
102
|
+
const upsertStmtWithReturn = this.db.prepare(`
|
|
103
|
+
INSERT INTO pages (
|
|
104
|
+
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
105
|
+
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
106
|
+
soft404_score, noindex, nofollow, security_error, retries, depth,
|
|
107
|
+
redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
108
|
+
updated_at
|
|
109
|
+
) VALUES (
|
|
110
|
+
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
111
|
+
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
112
|
+
@soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
|
|
113
|
+
@redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
114
|
+
datetime('now')
|
|
115
|
+
)
|
|
116
|
+
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
117
|
+
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
118
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
119
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
120
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
121
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
122
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
123
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
124
|
+
html = COALESCE(excluded.html, pages.html),
|
|
125
|
+
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
126
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
127
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
128
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
129
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
130
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
131
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
132
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
133
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
134
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
135
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
136
|
+
updated_at = datetime('now')
|
|
137
|
+
RETURNING id
|
|
138
|
+
`);
|
|
139
|
+
const urlToId = new Map();
|
|
140
|
+
const tx = this.db.transaction((pagesBatch) => {
|
|
141
|
+
for (const page of pagesBatch) {
|
|
142
|
+
const params = {
|
|
143
|
+
site_id: page.site_id,
|
|
144
|
+
normalized_url: page.normalized_url,
|
|
145
|
+
first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
|
|
146
|
+
last_seen_snapshot_id: page.last_seen_snapshot_id,
|
|
147
|
+
http_status: page.http_status ?? null,
|
|
148
|
+
canonical_url: page.canonical_url ?? null,
|
|
149
|
+
content_hash: page.content_hash ?? null,
|
|
150
|
+
simhash: page.simhash ?? null,
|
|
151
|
+
etag: page.etag ?? null,
|
|
152
|
+
last_modified: page.last_modified ?? null,
|
|
153
|
+
html: page.html ?? null,
|
|
154
|
+
soft404_score: page.soft404_score ?? null,
|
|
155
|
+
noindex: page.noindex ?? 0,
|
|
156
|
+
nofollow: page.nofollow ?? 0,
|
|
157
|
+
security_error: page.security_error ?? null,
|
|
158
|
+
retries: page.retries ?? 0,
|
|
159
|
+
depth: page.depth ?? 0,
|
|
160
|
+
redirect_chain: page.redirect_chain ?? null,
|
|
161
|
+
bytes_received: page.bytes_received ?? null,
|
|
162
|
+
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
163
|
+
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
164
|
+
trap_type: page.trap_type ?? null,
|
|
165
|
+
};
|
|
166
|
+
const row = upsertStmtWithReturn.get(params);
|
|
167
|
+
urlToId.set(page.normalized_url, row.id);
|
|
168
|
+
}
|
|
169
|
+
});
|
|
170
|
+
tx(pages);
|
|
171
|
+
return urlToId;
|
|
172
|
+
}
|
|
86
173
|
getPagesBySnapshot(snapshotId) {
|
|
87
|
-
return this.db.prepare('SELECT
|
|
174
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
|
|
175
|
+
}
|
|
176
|
+
getPagesIdentityBySnapshot(snapshotId) {
|
|
177
|
+
return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
|
|
178
|
+
}
|
|
179
|
+
getPagesIteratorBySnapshot(snapshotId) {
|
|
180
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').iterate(snapshotId, snapshotId);
|
|
88
181
|
}
|
|
89
182
|
getIdByUrl(siteId, url) {
|
|
90
183
|
const row = this.getIdStmt.get(siteId, url);
|
|
@@ -9,7 +9,10 @@ export interface Site {
|
|
|
9
9
|
export declare class SiteRepository {
|
|
10
10
|
private db;
|
|
11
11
|
constructor(db: Database);
|
|
12
|
+
getSiteById(id: number): Site | undefined;
|
|
12
13
|
getSite(domain: string): Site | undefined;
|
|
14
|
+
getAllSites(): Site[];
|
|
13
15
|
createSite(domain: string): number;
|
|
14
16
|
firstOrCreateSite(domain: string): Site;
|
|
17
|
+
deleteSite(id: number): void;
|
|
15
18
|
}
|
|
@@ -3,9 +3,15 @@ export class SiteRepository {
|
|
|
3
3
|
constructor(db) {
|
|
4
4
|
this.db = db;
|
|
5
5
|
}
|
|
6
|
+
getSiteById(id) {
|
|
7
|
+
return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id);
|
|
8
|
+
}
|
|
6
9
|
getSite(domain) {
|
|
7
10
|
return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain);
|
|
8
11
|
}
|
|
12
|
+
getAllSites() {
|
|
13
|
+
return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all();
|
|
14
|
+
}
|
|
9
15
|
createSite(domain) {
|
|
10
16
|
const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
|
|
11
17
|
const info = stmt.run(domain);
|
|
@@ -19,4 +25,7 @@ export class SiteRepository {
|
|
|
19
25
|
}
|
|
20
26
|
return site;
|
|
21
27
|
}
|
|
28
|
+
deleteSite(id) {
|
|
29
|
+
this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
|
|
30
|
+
}
|
|
22
31
|
}
|
|
@@ -17,6 +17,8 @@ export declare class SnapshotRepository {
|
|
|
17
17
|
constructor(db: Database);
|
|
18
18
|
createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status?: 'running' | 'completed' | 'failed'): number;
|
|
19
19
|
getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined;
|
|
20
|
+
getSnapshotCount(siteId: number): number;
|
|
20
21
|
updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats?: Partial<Snapshot>): void;
|
|
21
22
|
getSnapshot(id: number): Snapshot | undefined;
|
|
23
|
+
deleteSnapshot(id: number): void;
|
|
22
24
|
}
|
|
@@ -4,20 +4,29 @@ export class SnapshotRepository {
|
|
|
4
4
|
this.db = db;
|
|
5
5
|
}
|
|
6
6
|
createSnapshot(siteId, type, status = 'running') {
|
|
7
|
+
// Basic throttling or sleep if needed for tests, but generally SQLite is fast enough to have diff timestamps if not in same ms.
|
|
8
|
+
// However, if we run in memory, created_at is default current time.
|
|
9
|
+
// If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
|
|
10
|
+
// We should rely on ID for stability if timestamps are equal, but the query uses created_at.
|
|
11
|
+
// Let's ensure we can also order by ID as tie-breaker.
|
|
7
12
|
const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
|
|
8
13
|
const info = stmt.run(siteId, type, status);
|
|
9
14
|
return info.lastInsertRowid;
|
|
10
15
|
}
|
|
11
16
|
getLatestSnapshot(siteId, status) {
|
|
12
|
-
let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
|
|
17
|
+
let sql = 'SELECT * FROM snapshots WHERE site_id = ? AND type != \'partial\'';
|
|
13
18
|
const params = [siteId];
|
|
14
19
|
if (status) {
|
|
15
20
|
sql += ' AND status = ?';
|
|
16
21
|
params.push(status);
|
|
17
22
|
}
|
|
18
|
-
sql += ' ORDER BY created_at DESC LIMIT 1';
|
|
23
|
+
sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
|
|
19
24
|
return this.db.prepare(sql).get(...params);
|
|
20
25
|
}
|
|
26
|
+
getSnapshotCount(siteId) {
|
|
27
|
+
const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
|
|
28
|
+
return result.count;
|
|
29
|
+
}
|
|
21
30
|
updateSnapshotStatus(id, status, stats = {}) {
|
|
22
31
|
const sets = ['status = ?'];
|
|
23
32
|
const params = [status];
|
|
@@ -52,4 +61,16 @@ export class SnapshotRepository {
|
|
|
52
61
|
getSnapshot(id) {
|
|
53
62
|
return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id);
|
|
54
63
|
}
|
|
64
|
+
deleteSnapshot(id) {
|
|
65
|
+
const tx = this.db.transaction(() => {
|
|
66
|
+
// Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
|
|
67
|
+
this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
|
|
68
|
+
this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
|
|
69
|
+
// Cleanup: Delete pages that are no longer referenced by any snapshot
|
|
70
|
+
this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
|
|
71
|
+
// Delete the snapshot
|
|
72
|
+
this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
|
|
73
|
+
});
|
|
74
|
+
tx();
|
|
75
|
+
}
|
|
55
76
|
}
|