@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -6,37 +6,71 @@ import { PageRepository } from '../db/repositories/PageRepository.js';
6
6
  import { computePageRank } from '../graph/pagerank.js';
7
7
  import { calculateMetrics } from '../graph/metrics.js';
8
8
  import { computeHITS } from '../scoring/hits.js';
9
+ import { EngineContext } from '../events.js';
10
+ import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
9
11
 
10
- export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached: boolean = false) {
12
+ import { Graph } from '../graph/graph.js';
13
+
14
+ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached: boolean = false, graphInstance?: Graph) {
11
15
  const db = getDb();
12
16
  const metricsRepo = new MetricsRepository(db);
13
17
  const snapshotRepo = new SnapshotRepository(db);
14
18
  const pageRepo = new PageRepository(db);
15
19
 
20
+ const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
21
+
22
+ // Fallback emitter
23
+ const emit = (event: any) => {
24
+ if (context) {
25
+ context.emit(event);
26
+ } else {
27
+ if (event.type === 'error') console.error(event.message);
28
+ else if (event.type !== 'debug') console.log(event.message || event.phase);
29
+ }
30
+ };
31
+
16
32
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
17
33
  if (!snapshot) {
18
- console.error(`Snapshot ${snapshotId} not found`);
34
+ emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
19
35
  return;
20
36
  }
21
37
 
22
- console.log('Loading graph for metrics calculation...');
23
- const graph = loadGraphFromSnapshot(snapshotId);
38
+ if (!graphInstance) {
39
+ emit({ type: 'metrics:start', phase: 'Loading graph' });
40
+ }
24
41
 
25
- console.log('Computing PageRank...');
42
+ emit({ type: 'metrics:start', phase: 'Computing PageRank' });
26
43
  computePageRank(graph);
27
44
 
28
- console.log('Computing HITS...');
45
+ emit({ type: 'metrics:start', phase: 'Computing HITS' });
29
46
  computeHITS(graph);
30
47
 
31
- console.log('Updating metrics in DB...');
48
+ emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
32
49
  const nodes = graph.getNodes();
33
50
 
51
+ // Pre-fetch all page IDs to avoid N+1 queries
52
+ // Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
53
+ const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
54
+ const urlToId = new Map<string, number>();
55
+ for (const p of pages) {
56
+ urlToId.set(p.normalized_url, p.id);
57
+ }
58
+
59
+ const clusterStmt = db.prepare(`
60
+ INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
61
+ VALUES (?, ?, ?, ?, ?, ?)
62
+ `);
63
+
64
+ const contentStmt = db.prepare(`
65
+ INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
66
+ VALUES (?, ?, ?, ?, ?, ?)
67
+ `);
68
+
34
69
  const tx = db.transaction(() => {
35
70
  for (const node of nodes) {
36
- const pageId = pageRepo.getIdByUrl(snapshot.site_id, node.url);
71
+ const pageId = urlToId.get(node.url);
37
72
  if (!pageId) continue;
38
73
 
39
- const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
40
74
 
41
75
  metricsRepo.insertMetrics({
42
76
  snapshot_id: snapshotId,
@@ -46,11 +80,11 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitR
46
80
  pagerank: node.pageRank ?? null,
47
81
  pagerank_score: node.pageRankScore ?? null,
48
82
  link_role: node.linkRole ?? null,
49
- crawl_status: existing?.crawl_status ?? null,
50
- word_count: existing?.word_count ?? null,
51
- thin_content_score: existing?.thin_content_score ?? null,
52
- external_link_ratio: existing?.external_link_ratio ?? null,
53
- orphan_score: existing?.orphan_score ?? null,
83
+ crawl_status: node.crawlStatus ?? null,
84
+ word_count: node.wordCount ?? null,
85
+ thin_content_score: node.thinContentScore ?? null,
86
+ external_link_ratio: node.externalLinkRatio ?? null,
87
+ orphan_score: node.orphanScore ?? null,
54
88
  duplicate_cluster_id: node.duplicateClusterId ?? null,
55
89
  duplicate_type: node.duplicateType ?? null,
56
90
  is_cluster_primary: node.isClusterPrimary ? 1 : 0
@@ -72,53 +106,32 @@ export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitR
72
106
  }
73
107
 
74
108
  // Save duplicate clusters
75
- if (graph.duplicateClusters.length > 0) {
76
- const clusterStmt = db.prepare(`
77
- INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
78
- VALUES (?, ?, ?, ?, ?, ?)
79
- `);
80
- for (const cluster of graph.duplicateClusters) {
81
- clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
82
- }
109
+ for (const cluster of graph.duplicateClusters) {
110
+ clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
83
111
  }
84
112
 
85
113
  // Save content clusters
86
- if (graph.contentClusters.length > 0) {
87
- const contentStmt = db.prepare(`
88
- INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
89
- VALUES (?, ?, ?, ?, ?, ?)
90
- `);
91
- for (const cluster of graph.contentClusters) {
92
- contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
93
- }
114
+ for (const cluster of graph.contentClusters) {
115
+ contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
94
116
  }
95
117
  });
96
118
  tx();
97
119
 
98
- console.log('Computing aggregate stats...');
120
+ emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
99
121
  const metrics = calculateMetrics(graph, maxDepth);
100
122
 
101
- let totalScore = 0;
102
- let totalWeight = 0;
103
- for (const node of nodes) {
104
- const score = node.authorityScore || node.pageRankScore || 0;
105
- const depth = node.depth;
106
- const weight = 1 / (depth + 1);
107
- totalScore += score * weight;
108
- totalWeight += weight;
109
- }
110
- const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
111
-
112
- const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId) as { count: number };
123
+ // Calculate penalty-based health score (matches CLI)
124
+ const issues = collectCrawlIssues(graph, metrics);
125
+ const health = calculateHealthScore(metrics.totalPages, issues);
113
126
 
114
127
  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
115
128
  node_count: metrics.totalPages,
116
129
  edge_count: metrics.totalEdges,
117
- health_score: healthScore,
118
- orphan_count: metrics.orphanPages.length,
119
- thin_content_count: thinCountRow.count,
130
+ health_score: health.score,
131
+ orphan_count: issues.orphanPages,
132
+ thin_content_count: issues.thinContent,
120
133
  limit_reached: limitReached ? 1 : 0
121
134
  });
122
135
 
123
- console.log('Metrics calculation complete.');
136
+ emit({ type: 'metrics:complete', durationMs: 0 });
124
137
  }
@@ -1,8 +1,11 @@
1
1
  import { request } from 'undici';
2
2
  import * as cheerio from 'cheerio';
3
3
  import { normalizeUrl } from './normalize.js';
4
+ import { EngineContext } from '../events.js';
4
5
 
5
6
  export class Sitemap {
7
+ constructor(private context?: EngineContext) {}
8
+
6
9
  /**
7
10
  * Fetches and parses a sitemap (or sitemap index) to extract URLs.
8
11
  * Recursively handles sitemap indexes with loop detection and depth limits.
@@ -67,7 +70,7 @@ export class Sitemap {
67
70
  await res.body.dump();
68
71
  }
69
72
  } catch (e) {
70
- console.warn(`Failed to fetch sitemap ${url}:`, e);
73
+ this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
71
74
  }
72
75
  }
73
76
  }
@@ -12,8 +12,8 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
12
12
  const metricsRepo = new MetricsRepository(db);
13
13
  const snapshotRepo = new SnapshotRepository(db);
14
14
 
15
- const pages = pageRepo.getPagesBySnapshot(snapshotId);
16
- const metrics = metricsRepo.getMetrics(snapshotId);
15
+ const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
16
+ const metrics = metricsRepo.getMetricsIterator(snapshotId);
17
17
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
18
18
  const metricsMap = new Map<number, DbMetrics>();
19
19
  for (const m of metrics) {
@@ -21,6 +21,10 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
21
21
  }
22
22
 
23
23
  const graph = new Graph();
24
+ let pagesFetched = 0;
25
+ let pagesCached = 0;
26
+ let pagesSkipped = 0;
27
+
24
28
  if (snapshot) {
25
29
  graph.limitReached = !!snapshot.limit_reached;
26
30
  }
@@ -31,6 +35,18 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
31
35
  graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
32
36
 
33
37
  const m = metricsMap.get(p.id);
38
+ if (m) {
39
+ const isProcessed = m.crawl_status === 'fetched' ||
40
+ m.crawl_status === 'fetched_error' ||
41
+ m.crawl_status === 'network_error' ||
42
+ m.crawl_status === 'failed_after_retries' ||
43
+ m.crawl_status === 'blocked_by_robots';
44
+
45
+ if (isProcessed) pagesFetched++;
46
+ else if (m.crawl_status === 'cached') pagesCached++;
47
+ else if (m.crawl_status === 'skipped') pagesSkipped++;
48
+ }
49
+
34
50
  let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
35
51
  if (p.first_seen_snapshot_id === snapshotId) {
36
52
  incrementalStatus = 'new';
@@ -68,10 +84,16 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
68
84
  duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
69
85
  duplicateType: m?.duplicate_type ?? undefined,
70
86
  isClusterPrimary: m?.is_cluster_primary ? true : undefined,
87
+ // Additional metrics
88
+ crawlStatus: m?.crawl_status || undefined,
89
+ wordCount: m?.word_count != null ? m.word_count : undefined,
90
+ thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
91
+ externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
92
+ orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
71
93
  });
72
94
  }
73
95
 
74
- const edges = edgeRepo.getEdgesBySnapshot(snapshotId);
96
+ const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
75
97
 
76
98
  for (const e of edges) {
77
99
  const source = idMap.get(e.source_page_id);
@@ -101,5 +123,13 @@ export function loadGraphFromSnapshot(snapshotId: number): Graph {
101
123
  sharedPathPrefix: c.shared_path_prefix || undefined
102
124
  }));
103
125
 
126
+ // Set session stats
127
+ graph.sessionStats = {
128
+ pagesFetched,
129
+ pagesCached,
130
+ pagesSkipped,
131
+ totalFound: idMap.size
132
+ };
133
+
104
134
  return graph;
105
135
  }
package/src/db/index.ts CHANGED
@@ -6,6 +6,10 @@ import { initSchema } from './schema.js';
6
6
 
7
7
  let dbInstance: Database.Database | null = null;
8
8
 
9
+ export * from './repositories/SiteRepository.js';
10
+ export * from './repositories/SnapshotRepository.js';
11
+ export { initSchema } from './schema.js';
12
+
9
13
  export function getDbPath(): string {
10
14
  if (process.env.NODE_ENV === 'test') {
11
15
  return ':memory:';
@@ -52,6 +56,7 @@ export function getDb(): Database.Database {
52
56
  // Integrity check on startup
53
57
  const integrity = db.pragma('integrity_check', { simple: true });
54
58
  if (integrity !== 'ok') {
59
+ // Reverted to console.warn to avoid breaking change
55
60
  console.warn('Database integrity check failed:', integrity);
56
61
  }
57
62
 
@@ -23,7 +23,21 @@ export class EdgeRepository {
23
23
  this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
24
24
  }
25
25
 
26
+ insertEdges(edges: { snapshot_id: number; source_page_id: number; target_page_id: number; weight: number; rel: string }[]) {
27
+ if (edges.length === 0) return;
28
+ const tx = this.db.transaction((edgesBatch) => {
29
+ for (const edge of edgesBatch) {
30
+ this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
31
+ }
32
+ });
33
+ tx(edges);
34
+ }
35
+
26
36
  getEdgesBySnapshot(snapshotId: number): Edge[] {
27
37
  return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId) as Edge[];
28
38
  }
39
+
40
+ getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge> {
41
+ return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId) as IterableIterator<Edge>;
42
+ }
29
43
  }
@@ -20,8 +20,10 @@ export interface DbMetrics {
20
20
 
21
21
  export class MetricsRepository {
22
22
  private insertStmt;
23
+ private getByPageStmt;
23
24
 
24
25
  constructor(private db: Database) {
26
+ this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
25
27
  this.insertStmt = this.db.prepare(`
26
28
  INSERT OR REPLACE INTO metrics (
27
29
  snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
@@ -43,7 +45,19 @@ export class MetricsRepository {
43
45
  return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId) as DbMetrics[];
44
46
  }
45
47
 
48
+ getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics> {
49
+ return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId) as IterableIterator<DbMetrics>;
50
+ }
51
+
46
52
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
47
- return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?').get(snapshotId, pageId) as DbMetrics | undefined;
53
+ return this.getByPageStmt.get(snapshotId, pageId) as DbMetrics | undefined;
54
+ }
55
+
56
+ insertMany(metricsList: DbMetrics[]) {
57
+ const insert = this.insertStmt;
58
+ const tx = this.db.transaction((items: DbMetrics[]) => {
59
+ for (const item of items) insert.run(item);
60
+ });
61
+ tx(metricsList);
48
62
  }
49
63
  }
@@ -49,24 +49,24 @@ export class PageRepository {
49
49
  )
50
50
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
51
51
  last_seen_snapshot_id = excluded.last_seen_snapshot_id,
52
- http_status = excluded.http_status,
53
- canonical_url = excluded.canonical_url,
54
- content_hash = excluded.content_hash,
55
- simhash = excluded.simhash,
56
- etag = excluded.etag,
57
- last_modified = excluded.last_modified,
58
- html = excluded.html,
59
- soft404_score = excluded.soft404_score,
60
- noindex = excluded.noindex,
61
- nofollow = excluded.nofollow,
62
- security_error = excluded.security_error,
63
- retries = excluded.retries,
64
- depth = excluded.depth,
65
- redirect_chain = excluded.redirect_chain,
66
- bytes_received = excluded.bytes_received,
67
- crawl_trap_flag = excluded.crawl_trap_flag,
68
- crawl_trap_risk = excluded.crawl_trap_risk,
69
- trap_type = excluded.trap_type,
52
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
53
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
54
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
55
+ simhash = COALESCE(excluded.simhash, pages.simhash),
56
+ etag = COALESCE(excluded.etag, pages.etag),
57
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
58
+ html = COALESCE(excluded.html, pages.html),
59
+ soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
60
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
61
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
62
+ security_error = COALESCE(excluded.security_error, pages.security_error),
63
+ retries = MAX(pages.retries, excluded.retries),
64
+ depth = MIN(pages.depth, excluded.depth),
65
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
66
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
67
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
68
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
69
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
70
70
  updated_at = datetime('now')
71
71
  `);
72
72
 
@@ -117,8 +117,108 @@ export class PageRepository {
117
117
  return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url) as Page | undefined;
118
118
  }
119
119
 
120
+ getPagesByUrls(siteId: number, urls: string[]): Page[] {
121
+ if (urls.length === 0) return [];
122
+ const chunkSize = 900;
123
+ const results: Page[] = [];
124
+
125
+ for (let i = 0; i < urls.length; i += chunkSize) {
126
+ const chunk = urls.slice(i, i + chunkSize);
127
+ const placeholders = chunk.map(() => '?').join(',');
128
+ const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk) as Page[];
129
+ results.push(...chunkResults);
130
+ }
131
+
132
+ return results;
133
+ }
134
+
135
+ upsertMany(pages: (Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number })[]): Map<string, number> {
136
+ if (pages.length === 0) return new Map();
137
+
138
+ const upsertStmtWithReturn = this.db.prepare(`
139
+ INSERT INTO pages (
140
+ site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
141
+ http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
142
+ soft404_score, noindex, nofollow, security_error, retries, depth,
143
+ redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
144
+ updated_at
145
+ ) VALUES (
146
+ @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
147
+ @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
148
+ @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
149
+ @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
150
+ datetime('now')
151
+ )
152
+ ON CONFLICT(site_id, normalized_url) DO UPDATE SET
153
+ last_seen_snapshot_id = excluded.last_seen_snapshot_id,
154
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
155
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
156
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
157
+ simhash = COALESCE(excluded.simhash, pages.simhash),
158
+ etag = COALESCE(excluded.etag, pages.etag),
159
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
160
+ html = COALESCE(excluded.html, pages.html),
161
+ soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
162
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
163
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
164
+ security_error = COALESCE(excluded.security_error, pages.security_error),
165
+ retries = MAX(pages.retries, excluded.retries),
166
+ depth = MIN(pages.depth, excluded.depth),
167
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
168
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
169
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
170
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
171
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
172
+ updated_at = datetime('now')
173
+ RETURNING id
174
+ `);
175
+
176
+ const urlToId = new Map<string, number>();
177
+ const tx = this.db.transaction((pagesBatch) => {
178
+ for (const page of pagesBatch) {
179
+ const params = {
180
+ site_id: page.site_id,
181
+ normalized_url: page.normalized_url,
182
+ first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
183
+ last_seen_snapshot_id: page.last_seen_snapshot_id,
184
+ http_status: page.http_status ?? null,
185
+ canonical_url: page.canonical_url ?? null,
186
+ content_hash: page.content_hash ?? null,
187
+ simhash: page.simhash ?? null,
188
+ etag: page.etag ?? null,
189
+ last_modified: page.last_modified ?? null,
190
+ html: page.html ?? null,
191
+ soft404_score: page.soft404_score ?? null,
192
+ noindex: page.noindex ?? 0,
193
+ nofollow: page.nofollow ?? 0,
194
+ security_error: page.security_error ?? null,
195
+ retries: page.retries ?? 0,
196
+ depth: page.depth ?? 0,
197
+ redirect_chain: page.redirect_chain ?? null,
198
+ bytes_received: page.bytes_received ?? null,
199
+ crawl_trap_flag: page.crawl_trap_flag ?? 0,
200
+ crawl_trap_risk: page.crawl_trap_risk ?? null,
201
+ trap_type: page.trap_type ?? null,
202
+ };
203
+ const row = upsertStmtWithReturn.get(params) as { id: number };
204
+ urlToId.set(page.normalized_url, row.id);
205
+ }
206
+ });
207
+
208
+ tx(pages);
209
+ return urlToId;
210
+ }
211
+
120
212
  getPagesBySnapshot(snapshotId: number): Page[] {
121
- return this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?').all(snapshotId) as Page[];
213
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId) as Page[];
214
+ }
215
+
216
+ getPagesIdentityBySnapshot(snapshotId: number): { id: number; normalized_url: string }[] {
217
+ return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId) as { id: number; normalized_url: string }[];
218
+ }
219
+
220
+ getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page> {
221
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').iterate(snapshotId, snapshotId) as IterableIterator<Page>;
122
222
  }
123
223
 
124
224
  getIdByUrl(siteId: number, url: string): number | undefined {
@@ -11,10 +11,18 @@ export interface Site {
11
11
  export class SiteRepository {
12
12
  constructor(private db: Database) { }
13
13
 
14
+ getSiteById(id: number): Site | undefined {
15
+ return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id) as Site | undefined;
16
+ }
17
+
14
18
  getSite(domain: string): Site | undefined {
15
19
  return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain) as Site | undefined;
16
20
  }
17
21
 
22
+ getAllSites(): Site[] {
23
+ return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all() as Site[];
24
+ }
25
+
18
26
  createSite(domain: string): number {
19
27
  const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
20
28
  const info = stmt.run(domain);
@@ -29,4 +37,7 @@ export class SiteRepository {
29
37
  }
30
38
  return site!;
31
39
  }
40
+ deleteSite(id: number): void {
41
+ this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
42
+ }
32
43
  }
@@ -15,25 +15,35 @@ export interface Snapshot {
15
15
  }
16
16
 
17
17
  export class SnapshotRepository {
18
- constructor(private db: Database) {}
18
+ constructor(private db: Database) { }
19
19
 
20
20
  createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status: 'running' | 'completed' | 'failed' = 'running'): number {
21
+ // Basic throttling or sleep if needed for tests, but generally SQLite is fast enough to have diff timestamps if not in same ms.
22
+ // However, if we run in memory, created_at is default current time.
23
+ // If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
24
+ // We should rely on ID for stability if timestamps are equal, but the query uses created_at.
25
+ // Let's ensure we can also order by ID as tie-breaker.
21
26
  const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
22
27
  const info = stmt.run(siteId, type, status);
23
28
  return info.lastInsertRowid as number;
24
29
  }
25
30
 
26
31
  getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined {
27
- let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
32
+ let sql = 'SELECT * FROM snapshots WHERE site_id = ? AND type != \'partial\'';
28
33
  const params: any[] = [siteId];
29
34
  if (status) {
30
35
  sql += ' AND status = ?';
31
36
  params.push(status);
32
37
  }
33
- sql += ' ORDER BY created_at DESC LIMIT 1';
38
+ sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
34
39
  return this.db.prepare(sql).get(...params) as Snapshot | undefined;
35
40
  }
36
41
 
42
+ getSnapshotCount(siteId: number): number {
43
+ const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId) as { count: number };
44
+ return result.count;
45
+ }
46
+
37
47
  updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats: Partial<Snapshot> = {}) {
38
48
  const sets: string[] = ['status = ?'];
39
49
  const params: any[] = [status];
@@ -71,4 +81,19 @@ export class SnapshotRepository {
71
81
  getSnapshot(id: number): Snapshot | undefined {
72
82
  return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id) as Snapshot | undefined;
73
83
  }
84
+
85
+ deleteSnapshot(id: number): void {
86
+ const tx = this.db.transaction(() => {
87
+ // Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
88
+ this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
89
+ this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
90
+
91
+ // Cleanup: Delete pages that are no longer referenced by any snapshot
92
+ this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
93
+
94
+ // Delete the snapshot
95
+ this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
96
+ });
97
+ tx();
98
+ }
74
99
  }
package/src/events.ts ADDED
@@ -0,0 +1,16 @@
1
+ export type CrawlEvent =
2
+ | { type: 'crawl:start'; url: string }
3
+ | { type: 'crawl:success'; url: string; status: number; durationMs: number; depth?: number }
4
+ | { type: 'crawl:error'; url: string; error: string; depth?: number }
5
+ | { type: 'crawl:limit-reached'; limit: number }
6
+ | { type: 'queue:enqueue'; url: string; depth: number }
7
+ | { type: 'metrics:start'; phase: string }
8
+ | { type: 'metrics:complete'; durationMs: number }
9
+ | { type: 'debug'; message: string; context?: unknown }
10
+ | { type: 'info'; message: string; context?: unknown }
11
+ | { type: 'warn'; message: string; context?: unknown }
12
+ | { type: 'error'; message: string; error?: unknown; context?: unknown };
13
+
14
+ export interface EngineContext {
15
+ emit: (event: CrawlEvent) => void;
16
+ }