@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -6,30 +6,59 @@ import { PageRepository } from '../db/repositories/PageRepository.js';
6
6
  import { computePageRank } from '../graph/pagerank.js';
7
7
  import { calculateMetrics } from '../graph/metrics.js';
8
8
  import { computeHITS } from '../scoring/hits.js';
9
- export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false) {
9
+ import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
10
+ export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached = false, graphInstance) {
10
11
  const db = getDb();
11
12
  const metricsRepo = new MetricsRepository(db);
12
13
  const snapshotRepo = new SnapshotRepository(db);
13
14
  const pageRepo = new PageRepository(db);
15
+ const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
16
+ // Fallback emitter
17
+ const emit = (event) => {
18
+ if (context) {
19
+ context.emit(event);
20
+ }
21
+ else {
22
+ if (event.type === 'error')
23
+ console.error(event.message);
24
+ else if (event.type !== 'debug')
25
+ console.log(event.message || event.phase);
26
+ }
27
+ };
14
28
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
15
29
  if (!snapshot) {
16
- console.error(`Snapshot ${snapshotId} not found`);
30
+ emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
17
31
  return;
18
32
  }
19
- console.log('Loading graph for metrics calculation...');
20
- const graph = loadGraphFromSnapshot(snapshotId);
21
- console.log('Computing PageRank...');
33
+ if (!graphInstance) {
34
+ emit({ type: 'metrics:start', phase: 'Loading graph' });
35
+ }
36
+ emit({ type: 'metrics:start', phase: 'Computing PageRank' });
22
37
  computePageRank(graph);
23
- console.log('Computing HITS...');
38
+ emit({ type: 'metrics:start', phase: 'Computing HITS' });
24
39
  computeHITS(graph);
25
- console.log('Updating metrics in DB...');
40
+ emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
26
41
  const nodes = graph.getNodes();
42
+ // Pre-fetch all page IDs to avoid N+1 queries
43
+ // Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
44
+ const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
45
+ const urlToId = new Map();
46
+ for (const p of pages) {
47
+ urlToId.set(p.normalized_url, p.id);
48
+ }
49
+ const clusterStmt = db.prepare(`
50
+ INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
51
+ VALUES (?, ?, ?, ?, ?, ?)
52
+ `);
53
+ const contentStmt = db.prepare(`
54
+ INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
55
+ VALUES (?, ?, ?, ?, ?, ?)
56
+ `);
27
57
  const tx = db.transaction(() => {
28
58
  for (const node of nodes) {
29
- const pageId = pageRepo.getIdByUrl(snapshot.site_id, node.url);
59
+ const pageId = urlToId.get(node.url);
30
60
  if (!pageId)
31
61
  continue;
32
- const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
33
62
  metricsRepo.insertMetrics({
34
63
  snapshot_id: snapshotId,
35
64
  page_id: pageId,
@@ -38,11 +67,11 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
38
67
  pagerank: node.pageRank ?? null,
39
68
  pagerank_score: node.pageRankScore ?? null,
40
69
  link_role: node.linkRole ?? null,
41
- crawl_status: existing?.crawl_status ?? null,
42
- word_count: existing?.word_count ?? null,
43
- thin_content_score: existing?.thin_content_score ?? null,
44
- external_link_ratio: existing?.external_link_ratio ?? null,
45
- orphan_score: existing?.orphan_score ?? null,
70
+ crawl_status: node.crawlStatus ?? null,
71
+ word_count: node.wordCount ?? null,
72
+ thin_content_score: node.thinContentScore ?? null,
73
+ external_link_ratio: node.externalLinkRatio ?? null,
74
+ orphan_score: node.orphanScore ?? null,
46
75
  duplicate_cluster_id: node.duplicateClusterId ?? null,
47
76
  duplicate_type: node.duplicateType ?? null,
48
77
  is_cluster_primary: node.isClusterPrimary ? 1 : 0
@@ -62,47 +91,27 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
62
91
  }
63
92
  }
64
93
  // Save duplicate clusters
65
- if (graph.duplicateClusters.length > 0) {
66
- const clusterStmt = db.prepare(`
67
- INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
68
- VALUES (?, ?, ?, ?, ?, ?)
69
- `);
70
- for (const cluster of graph.duplicateClusters) {
71
- clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
72
- }
94
+ for (const cluster of graph.duplicateClusters) {
95
+ clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
73
96
  }
74
97
  // Save content clusters
75
- if (graph.contentClusters.length > 0) {
76
- const contentStmt = db.prepare(`
77
- INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
78
- VALUES (?, ?, ?, ?, ?, ?)
79
- `);
80
- for (const cluster of graph.contentClusters) {
81
- contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
82
- }
98
+ for (const cluster of graph.contentClusters) {
99
+ contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
83
100
  }
84
101
  });
85
102
  tx();
86
- console.log('Computing aggregate stats...');
103
+ emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
87
104
  const metrics = calculateMetrics(graph, maxDepth);
88
- let totalScore = 0;
89
- let totalWeight = 0;
90
- for (const node of nodes) {
91
- const score = node.authorityScore || node.pageRankScore || 0;
92
- const depth = node.depth;
93
- const weight = 1 / (depth + 1);
94
- totalScore += score * weight;
95
- totalWeight += weight;
96
- }
97
- const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
98
- const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId);
105
+ // Calculate penalty-based health score (matches CLI)
106
+ const issues = collectCrawlIssues(graph, metrics);
107
+ const health = calculateHealthScore(metrics.totalPages, issues);
99
108
  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
100
109
  node_count: metrics.totalPages,
101
110
  edge_count: metrics.totalEdges,
102
- health_score: healthScore,
103
- orphan_count: metrics.orphanPages.length,
104
- thin_content_count: thinCountRow.count,
111
+ health_score: health.score,
112
+ orphan_count: issues.orphanPages,
113
+ thin_content_count: issues.thinContent,
105
114
  limit_reached: limitReached ? 1 : 0
106
115
  });
107
- console.log('Metrics calculation complete.');
116
+ emit({ type: 'metrics:complete', durationMs: 0 });
108
117
  }
@@ -1,4 +1,7 @@
1
+ import { EngineContext } from '../events.js';
1
2
  export declare class Sitemap {
3
+ private context?;
4
+ constructor(context?: EngineContext | undefined);
2
5
  /**
3
6
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
4
7
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -2,6 +2,10 @@ import { request } from 'undici';
2
2
  import * as cheerio from 'cheerio';
3
3
  import { normalizeUrl } from './normalize.js';
4
4
  export class Sitemap {
5
+ context;
6
+ constructor(context) {
7
+ this.context = context;
8
+ }
5
9
  /**
6
10
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
7
11
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -64,7 +68,7 @@ export class Sitemap {
64
68
  }
65
69
  }
66
70
  catch (e) {
67
- console.warn(`Failed to fetch sitemap ${url}:`, e);
71
+ this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
68
72
  }
69
73
  }
70
74
  }
@@ -10,14 +10,17 @@ export function loadGraphFromSnapshot(snapshotId) {
10
10
  const edgeRepo = new EdgeRepository(db);
11
11
  const metricsRepo = new MetricsRepository(db);
12
12
  const snapshotRepo = new SnapshotRepository(db);
13
- const pages = pageRepo.getPagesBySnapshot(snapshotId);
14
- const metrics = metricsRepo.getMetrics(snapshotId);
13
+ const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
14
+ const metrics = metricsRepo.getMetricsIterator(snapshotId);
15
15
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
16
16
  const metricsMap = new Map();
17
17
  for (const m of metrics) {
18
18
  metricsMap.set(m.page_id, m);
19
19
  }
20
20
  const graph = new Graph();
21
+ let pagesFetched = 0;
22
+ let pagesCached = 0;
23
+ let pagesSkipped = 0;
21
24
  if (snapshot) {
22
25
  graph.limitReached = !!snapshot.limit_reached;
23
26
  }
@@ -26,6 +29,19 @@ export function loadGraphFromSnapshot(snapshotId) {
26
29
  idMap.set(p.id, p.normalized_url);
27
30
  graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
28
31
  const m = metricsMap.get(p.id);
32
+ if (m) {
33
+ const isProcessed = m.crawl_status === 'fetched' ||
34
+ m.crawl_status === 'fetched_error' ||
35
+ m.crawl_status === 'network_error' ||
36
+ m.crawl_status === 'failed_after_retries' ||
37
+ m.crawl_status === 'blocked_by_robots';
38
+ if (isProcessed)
39
+ pagesFetched++;
40
+ else if (m.crawl_status === 'cached')
41
+ pagesCached++;
42
+ else if (m.crawl_status === 'skipped')
43
+ pagesSkipped++;
44
+ }
29
45
  let incrementalStatus;
30
46
  if (p.first_seen_snapshot_id === snapshotId) {
31
47
  incrementalStatus = 'new';
@@ -64,9 +80,15 @@ export function loadGraphFromSnapshot(snapshotId) {
64
80
  duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
65
81
  duplicateType: m?.duplicate_type ?? undefined,
66
82
  isClusterPrimary: m?.is_cluster_primary ? true : undefined,
83
+ // Additional metrics
84
+ crawlStatus: m?.crawl_status || undefined,
85
+ wordCount: m?.word_count != null ? m.word_count : undefined,
86
+ thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
87
+ externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
88
+ orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
67
89
  });
68
90
  }
69
- const edges = edgeRepo.getEdgesBySnapshot(snapshotId);
91
+ const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
70
92
  for (const e of edges) {
71
93
  const source = idMap.get(e.source_page_id);
72
94
  const target = idMap.get(e.target_page_id);
@@ -92,5 +114,12 @@ export function loadGraphFromSnapshot(snapshotId) {
92
114
  risk: c.risk,
93
115
  sharedPathPrefix: c.shared_path_prefix || undefined
94
116
  }));
117
+ // Set session stats
118
+ graph.sessionStats = {
119
+ pagesFetched,
120
+ pagesCached,
121
+ pagesSkipped,
122
+ totalFound: idMap.size
123
+ };
95
124
  return graph;
96
125
  }
@@ -1,4 +1,7 @@
1
1
  import Database from 'better-sqlite3';
2
+ export * from './repositories/SiteRepository.js';
3
+ export * from './repositories/SnapshotRepository.js';
4
+ export { initSchema } from './schema.js';
2
5
  export declare function getDbPath(): string;
3
6
  export declare function getDb(): Database.Database;
4
7
  export declare function closeDb(): void;
package/dist/db/index.js CHANGED
@@ -4,6 +4,9 @@ import fs from 'node:fs';
4
4
  import os from 'node:os';
5
5
  import { initSchema } from './schema.js';
6
6
  let dbInstance = null;
7
+ export * from './repositories/SiteRepository.js';
8
+ export * from './repositories/SnapshotRepository.js';
9
+ export { initSchema } from './schema.js';
7
10
  export function getDbPath() {
8
11
  if (process.env.NODE_ENV === 'test') {
9
12
  return ':memory:';
@@ -46,6 +49,7 @@ export function getDb() {
46
49
  // Integrity check on startup
47
50
  const integrity = db.pragma('integrity_check', { simple: true });
48
51
  if (integrity !== 'ok') {
52
+ // Reverted to console.warn to avoid breaking change
49
53
  console.warn('Database integrity check failed:', integrity);
50
54
  }
51
55
  // Initialize schema
@@ -12,5 +12,13 @@ export declare class EdgeRepository {
12
12
  private insertStmt;
13
13
  constructor(db: Database);
14
14
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight?: number, rel?: string): void;
15
+ insertEdges(edges: {
16
+ snapshot_id: number;
17
+ source_page_id: number;
18
+ target_page_id: number;
19
+ weight: number;
20
+ rel: string;
21
+ }[]): void;
15
22
  getEdgesBySnapshot(snapshotId: number): Edge[];
23
+ getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
16
24
  }
@@ -11,7 +11,20 @@ export class EdgeRepository {
11
11
  insertEdge(snapshotId, sourcePageId, targetPageId, weight = 1.0, rel = 'internal') {
12
12
  this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
13
13
  }
14
+ insertEdges(edges) {
15
+ if (edges.length === 0)
16
+ return;
17
+ const tx = this.db.transaction((edgesBatch) => {
18
+ for (const edge of edgesBatch) {
19
+ this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
20
+ }
21
+ });
22
+ tx(edges);
23
+ }
14
24
  getEdgesBySnapshot(snapshotId) {
15
25
  return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
16
26
  }
27
+ getEdgesIteratorBySnapshot(snapshotId) {
28
+ return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId);
29
+ }
17
30
  }
@@ -19,8 +19,11 @@ export interface DbMetrics {
19
19
  export declare class MetricsRepository {
20
20
  private db;
21
21
  private insertStmt;
22
+ private getByPageStmt;
22
23
  constructor(db: Database);
23
24
  insertMetrics(metrics: DbMetrics): void;
24
25
  getMetrics(snapshotId: number): DbMetrics[];
26
+ getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics>;
25
27
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined;
28
+ insertMany(metricsList: DbMetrics[]): void;
26
29
  }
@@ -1,8 +1,10 @@
1
1
  export class MetricsRepository {
2
2
  db;
3
3
  insertStmt;
4
+ getByPageStmt;
4
5
  constructor(db) {
5
6
  this.db = db;
7
+ this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
6
8
  this.insertStmt = this.db.prepare(`
7
9
  INSERT OR REPLACE INTO metrics (
8
10
  snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
@@ -21,7 +23,18 @@ export class MetricsRepository {
21
23
  getMetrics(snapshotId) {
22
24
  return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId);
23
25
  }
26
+ getMetricsIterator(snapshotId) {
27
+ return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId);
28
+ }
24
29
  getMetricsForPage(snapshotId, pageId) {
25
- return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?').get(snapshotId, pageId);
30
+ return this.getByPageStmt.get(snapshotId, pageId);
31
+ }
32
+ insertMany(metricsList) {
33
+ const insert = this.insertStmt;
34
+ const tx = this.db.transaction((items) => {
35
+ for (const item of items)
36
+ insert.run(item);
37
+ });
38
+ tx(metricsList);
26
39
  }
27
40
  }
@@ -42,6 +42,17 @@ export declare class PageRepository {
42
42
  last_seen_snapshot_id: number;
43
43
  }): number;
44
44
  getPage(siteId: number, url: string): Page | undefined;
45
+ getPagesByUrls(siteId: number, urls: string[]): Page[];
46
+ upsertMany(pages: (Partial<Page> & {
47
+ site_id: number;
48
+ normalized_url: string;
49
+ last_seen_snapshot_id: number;
50
+ })[]): Map<string, number>;
45
51
  getPagesBySnapshot(snapshotId: number): Page[];
52
+ getPagesIdentityBySnapshot(snapshotId: number): {
53
+ id: number;
54
+ normalized_url: string;
55
+ }[];
56
+ getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page>;
46
57
  getIdByUrl(siteId: number, url: string): number | undefined;
47
58
  }
@@ -20,24 +20,24 @@ export class PageRepository {
20
20
  )
21
21
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
22
22
  last_seen_snapshot_id = excluded.last_seen_snapshot_id,
23
- http_status = excluded.http_status,
24
- canonical_url = excluded.canonical_url,
25
- content_hash = excluded.content_hash,
26
- simhash = excluded.simhash,
27
- etag = excluded.etag,
28
- last_modified = excluded.last_modified,
29
- html = excluded.html,
30
- soft404_score = excluded.soft404_score,
31
- noindex = excluded.noindex,
32
- nofollow = excluded.nofollow,
33
- security_error = excluded.security_error,
34
- retries = excluded.retries,
35
- depth = excluded.depth,
36
- redirect_chain = excluded.redirect_chain,
37
- bytes_received = excluded.bytes_received,
38
- crawl_trap_flag = excluded.crawl_trap_flag,
39
- crawl_trap_risk = excluded.crawl_trap_risk,
40
- trap_type = excluded.trap_type,
23
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
24
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
25
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
26
+ simhash = COALESCE(excluded.simhash, pages.simhash),
27
+ etag = COALESCE(excluded.etag, pages.etag),
28
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
29
+ html = COALESCE(excluded.html, pages.html),
30
+ soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
31
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
32
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
33
+ security_error = COALESCE(excluded.security_error, pages.security_error),
34
+ retries = MAX(pages.retries, excluded.retries),
35
+ depth = MIN(pages.depth, excluded.depth),
36
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
37
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
38
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
39
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
40
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
41
41
  updated_at = datetime('now')
42
42
  `);
43
43
  this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
@@ -83,8 +83,101 @@ export class PageRepository {
83
83
  getPage(siteId, url) {
84
84
  return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url);
85
85
  }
86
+ getPagesByUrls(siteId, urls) {
87
+ if (urls.length === 0)
88
+ return [];
89
+ const chunkSize = 900;
90
+ const results = [];
91
+ for (let i = 0; i < urls.length; i += chunkSize) {
92
+ const chunk = urls.slice(i, i + chunkSize);
93
+ const placeholders = chunk.map(() => '?').join(',');
94
+ const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk);
95
+ results.push(...chunkResults);
96
+ }
97
+ return results;
98
+ }
99
+ upsertMany(pages) {
100
+ if (pages.length === 0)
101
+ return new Map();
102
+ const upsertStmtWithReturn = this.db.prepare(`
103
+ INSERT INTO pages (
104
+ site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
105
+ http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
106
+ soft404_score, noindex, nofollow, security_error, retries, depth,
107
+ redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
108
+ updated_at
109
+ ) VALUES (
110
+ @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
111
+ @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
112
+ @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
113
+ @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
114
+ datetime('now')
115
+ )
116
+ ON CONFLICT(site_id, normalized_url) DO UPDATE SET
117
+ last_seen_snapshot_id = excluded.last_seen_snapshot_id,
118
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
119
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
120
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
121
+ simhash = COALESCE(excluded.simhash, pages.simhash),
122
+ etag = COALESCE(excluded.etag, pages.etag),
123
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
124
+ html = COALESCE(excluded.html, pages.html),
125
+ soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
126
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
127
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
128
+ security_error = COALESCE(excluded.security_error, pages.security_error),
129
+ retries = MAX(pages.retries, excluded.retries),
130
+ depth = MIN(pages.depth, excluded.depth),
131
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
132
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
133
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
134
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
135
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
136
+ updated_at = datetime('now')
137
+ RETURNING id
138
+ `);
139
+ const urlToId = new Map();
140
+ const tx = this.db.transaction((pagesBatch) => {
141
+ for (const page of pagesBatch) {
142
+ const params = {
143
+ site_id: page.site_id,
144
+ normalized_url: page.normalized_url,
145
+ first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
146
+ last_seen_snapshot_id: page.last_seen_snapshot_id,
147
+ http_status: page.http_status ?? null,
148
+ canonical_url: page.canonical_url ?? null,
149
+ content_hash: page.content_hash ?? null,
150
+ simhash: page.simhash ?? null,
151
+ etag: page.etag ?? null,
152
+ last_modified: page.last_modified ?? null,
153
+ html: page.html ?? null,
154
+ soft404_score: page.soft404_score ?? null,
155
+ noindex: page.noindex ?? 0,
156
+ nofollow: page.nofollow ?? 0,
157
+ security_error: page.security_error ?? null,
158
+ retries: page.retries ?? 0,
159
+ depth: page.depth ?? 0,
160
+ redirect_chain: page.redirect_chain ?? null,
161
+ bytes_received: page.bytes_received ?? null,
162
+ crawl_trap_flag: page.crawl_trap_flag ?? 0,
163
+ crawl_trap_risk: page.crawl_trap_risk ?? null,
164
+ trap_type: page.trap_type ?? null,
165
+ };
166
+ const row = upsertStmtWithReturn.get(params);
167
+ urlToId.set(page.normalized_url, row.id);
168
+ }
169
+ });
170
+ tx(pages);
171
+ return urlToId;
172
+ }
86
173
  getPagesBySnapshot(snapshotId) {
87
- return this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?').all(snapshotId);
174
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
175
+ }
176
+ getPagesIdentityBySnapshot(snapshotId) {
177
+ return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
178
+ }
179
+ getPagesIteratorBySnapshot(snapshotId) {
180
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').iterate(snapshotId, snapshotId);
88
181
  }
89
182
  getIdByUrl(siteId, url) {
90
183
  const row = this.getIdStmt.get(siteId, url);
@@ -9,7 +9,10 @@ export interface Site {
9
9
  export declare class SiteRepository {
10
10
  private db;
11
11
  constructor(db: Database);
12
+ getSiteById(id: number): Site | undefined;
12
13
  getSite(domain: string): Site | undefined;
14
+ getAllSites(): Site[];
13
15
  createSite(domain: string): number;
14
16
  firstOrCreateSite(domain: string): Site;
17
+ deleteSite(id: number): void;
15
18
  }
@@ -3,9 +3,15 @@ export class SiteRepository {
3
3
  constructor(db) {
4
4
  this.db = db;
5
5
  }
6
+ getSiteById(id) {
7
+ return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id);
8
+ }
6
9
  getSite(domain) {
7
10
  return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain);
8
11
  }
12
+ getAllSites() {
13
+ return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all();
14
+ }
9
15
  createSite(domain) {
10
16
  const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
11
17
  const info = stmt.run(domain);
@@ -19,4 +25,7 @@ export class SiteRepository {
19
25
  }
20
26
  return site;
21
27
  }
28
+ deleteSite(id) {
29
+ this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
30
+ }
22
31
  }
@@ -17,6 +17,8 @@ export declare class SnapshotRepository {
17
17
  constructor(db: Database);
18
18
  createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status?: 'running' | 'completed' | 'failed'): number;
19
19
  getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined;
20
+ getSnapshotCount(siteId: number): number;
20
21
  updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats?: Partial<Snapshot>): void;
21
22
  getSnapshot(id: number): Snapshot | undefined;
23
+ deleteSnapshot(id: number): void;
22
24
  }
@@ -4,20 +4,29 @@ export class SnapshotRepository {
4
4
  this.db = db;
5
5
  }
6
6
  createSnapshot(siteId, type, status = 'running') {
7
+ // Basic throttling or a sleep may be needed for tests, but SQLite is generally fast enough to produce different timestamps unless both inserts land in the same millisecond.
8
+ // However, if we run in memory, created_at is default current time.
9
+ // If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
10
+ // We should rely on ID for stability if timestamps are equal, but the query uses created_at.
11
+ // Let's ensure we can also order by ID as tie-breaker.
7
12
  const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
8
13
  const info = stmt.run(siteId, type, status);
9
14
  return info.lastInsertRowid;
10
15
  }
11
16
  getLatestSnapshot(siteId, status) {
12
- let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
17
+ let sql = 'SELECT * FROM snapshots WHERE site_id = ? AND type != \'partial\'';
13
18
  const params = [siteId];
14
19
  if (status) {
15
20
  sql += ' AND status = ?';
16
21
  params.push(status);
17
22
  }
18
- sql += ' ORDER BY created_at DESC LIMIT 1';
23
+ sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
19
24
  return this.db.prepare(sql).get(...params);
20
25
  }
26
+ getSnapshotCount(siteId) {
27
+ const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
28
+ return result.count;
29
+ }
21
30
  updateSnapshotStatus(id, status, stats = {}) {
22
31
  const sets = ['status = ?'];
23
32
  const params = [status];
@@ -52,4 +61,16 @@ export class SnapshotRepository {
52
61
  getSnapshot(id) {
53
62
  return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id);
54
63
  }
64
+ deleteSnapshot(id) {
65
+ const tx = this.db.transaction(() => {
66
+ // Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
67
+ this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
68
+ this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
69
+ // Cleanup: Delete pages that are no longer referenced by any snapshot
70
+ this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
71
+ // Delete the snapshot
72
+ this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
73
+ });
74
+ tx();
75
+ }
55
76
  }