@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -0,0 +1,118 @@
1
+ import { describe, it, expect, beforeEach } from 'vitest';
2
+ import { Graph } from '../src/graph/graph.js';
3
+ import { detectContentClusters } from '../src/graph/cluster.js';
4
+
5
+ describe('Cluster Risk Heuristic', () => {
6
+ let graph: Graph;
7
+
8
+ beforeEach(() => {
9
+ graph = new Graph();
10
+ });
11
+
12
+ it('should assign HIGH risk to clusters with identical titles', () => {
13
+ const html = '<html><head><title>Duplicate Title</title></head><body>Content</body></html>';
14
+ const h = 0b101010n.toString();
15
+
16
+ graph.addNode('https://example.com/p1', 0, 200);
17
+ graph.addNode('https://example.com/p2', 0, 200);
18
+ graph.addNode('https://example.com/p3', 0, 200);
19
+
20
+ graph.updateNodeData('https://example.com/p1', { simhash: h, html });
21
+ graph.updateNodeData('https://example.com/p2', { simhash: h, html });
22
+ graph.updateNodeData('https://example.com/p3', { simhash: h, html });
23
+
24
+ const clusters = detectContentClusters(graph, 2, 2);
25
+
26
+ expect(clusters.length).toBe(1);
27
+ expect(clusters[0].risk).toBe('high');
28
+ });
29
+
30
+ it('should assign HIGH risk to clusters with identical H1s', () => {
31
+ const h = 0b101010n.toString();
32
+
33
+ graph.addNode('https://example.com/p1', 0, 200);
34
+ graph.addNode('https://example.com/p2', 0, 200);
35
+ graph.addNode('https://example.com/p3', 0, 200);
36
+
37
+ // Different titles, same H1
38
+ graph.updateNodeData('https://example.com/p1', {
39
+ simhash: h,
40
+ html: '<html><head><title>Page 1</title></head><body><h1>Duplicate Header</h1></body></html>'
41
+ });
42
+ graph.updateNodeData('https://example.com/p2', {
43
+ simhash: h,
44
+ html: '<html><head><title>Page 2</title></head><body><h1>Duplicate Header</h1></body></html>'
45
+ });
46
+ graph.updateNodeData('https://example.com/p3', {
47
+ simhash: h,
48
+ html: '<html><head><title>Page 3</title></head><body><h1>Duplicate Header</h1></body></html>'
49
+ });
50
+
51
+ const clusters = detectContentClusters(graph, 2, 2);
52
+
53
+ expect(clusters.length).toBe(1);
54
+ expect(clusters[0].risk).toBe('high');
55
+ });
56
+
57
+ it('should assign LOW risk to small clusters with unique titles and H1s', () => {
58
+ const h = 0b101010n.toString();
59
+
60
+ graph.addNode('https://example.com/p1', 0, 200);
61
+ graph.addNode('https://example.com/p2', 0, 200);
62
+ graph.addNode('https://example.com/p3', 0, 200);
63
+
64
+ graph.updateNodeData('https://example.com/p1', {
65
+ simhash: h,
66
+ html: '<html><head><title>Page 1</title></head><body><h1>Header 1</h1></body></html>'
67
+ });
68
+ graph.updateNodeData('https://example.com/p2', {
69
+ simhash: h,
70
+ html: '<html><head><title>Page 2</title></head><body><h1>Header 2</h1></body></html>'
71
+ });
72
+ graph.updateNodeData('https://example.com/p3', {
73
+ simhash: h,
74
+ html: '<html><head><title>Page 3</title></head><body><h1>Header 3</h1></body></html>'
75
+ });
76
+
77
+ const clusters = detectContentClusters(graph, 2, 2);
78
+
79
+ expect(clusters.length).toBe(1);
80
+ expect(clusters[0].risk).toBe('low');
81
+ });
82
+
83
+ it('should assign MEDIUM risk to large clusters even with unique titles', () => {
84
+ const h = 0b101010n.toString();
85
+
86
+ // 12 nodes, all unique titles
87
+ for (let i = 0; i < 12; i++) {
88
+ const url = `https://example.com/p${i}`;
89
+ graph.addNode(url, 0, 200);
90
+ graph.updateNodeData(url, {
91
+ simhash: h,
92
+ html: `<html><head><title>Page ${i}</title></head><body><h1>Header ${i}</h1></body></html>`
93
+ });
94
+ }
95
+
96
+ const clusters = detectContentClusters(graph, 2, 2);
97
+
98
+ expect(clusters.length).toBe(1);
99
+ expect(clusters[0].risk).toBe('medium');
100
+ });
101
+
102
+ it('should handle missing HTML gracefully', () => {
103
+ const h = 0b101010n.toString();
104
+
105
+ graph.addNode('https://example.com/p1', 0, 200);
106
+ graph.addNode('https://example.com/p2', 0, 200);
107
+
108
+ // No HTML provided
109
+ graph.updateNodeData('https://example.com/p1', { simhash: h });
110
+ graph.updateNodeData('https://example.com/p2', { simhash: h });
111
+
112
+ const clusters = detectContentClusters(graph, 2, 2);
113
+
114
+ expect(clusters.length).toBe(1);
115
+ // Fallback to size based? 2 nodes -> low risk
116
+ expect(clusters[0].risk).toBe('low');
117
+ });
118
+ });
@@ -1,16 +1,22 @@
1
- import { test, expect, beforeEach, afterEach } from 'vitest';
1
+ import { test, expect, beforeEach, afterEach, vi } from 'vitest';
2
2
  import { crawl } from '../src/crawler/crawl.js';
3
3
  import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
4
4
  import { closeDb } from '../src/db/index.js';
5
5
  import { MockAgent, setGlobalDispatcher } from 'undici';
6
+ import { IPGuard } from '../src/core/security/ipGuard.js';
7
+ import { EngineContext } from '../src/events.js';
6
8
 
7
9
  let mockAgent: MockAgent;
10
+ const mockContext: EngineContext = { emit: vi.fn() };
8
11
 
9
12
  beforeEach(() => {
10
13
  process.env.CRAWLITH_DB_PATH = ':memory:';
11
14
  mockAgent = new MockAgent();
12
15
  mockAgent.disableNetConnect();
13
16
  setGlobalDispatcher(mockAgent);
17
+
18
+ // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
19
+ vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
14
20
  });
15
21
 
16
22
  afterEach(() => {
@@ -68,7 +74,7 @@ test('crawler should crawl and build graph', async () => {
68
74
  depth: 2,
69
75
  ignoreRobots: false,
70
76
  rate: 1000
71
- });
77
+ }, mockContext);
72
78
  const graph = loadGraphFromSnapshot(snapshotId);
73
79
 
74
80
  const nodes = graph.getNodes();
@@ -110,7 +116,7 @@ test('hard page limit', async () => {
110
116
  depth: 5,
111
117
  ignoreRobots: true,
112
118
  rate: 1000
113
- });
119
+ }, mockContext);
114
120
  const graph = loadGraphFromSnapshot(snapshotId);
115
121
 
116
122
  // Should have visited root + 1 other page (total 2 nodes with status > 0)
@@ -138,7 +144,7 @@ test('hard depth cap', async () => {
138
144
  depth: 20, // requested 20, but internal hard cap is 10
139
145
  ignoreRobots: true,
140
146
  rate: 1000
141
- });
147
+ }, mockContext);
142
148
  const graph = loadGraphFromSnapshot(snapshotId);
143
149
 
144
150
  const crawledNodes = graph.getNodes().filter(n => n.status > 0);
@@ -172,7 +178,7 @@ test('parameter explosion control', async () => {
172
178
  stripQuery: false,
173
179
  detectTraps: true,
174
180
  rate: 1000
175
- });
181
+ }, mockContext);
176
182
  const graph = loadGraphFromSnapshot(snapshotId);
177
183
 
178
184
  // Should only crawl 5 variations + root
@@ -205,7 +211,7 @@ test('redirect safety', async () => {
205
211
  depth: 5,
206
212
  ignoreRobots: true,
207
213
  rate: 1000
208
- });
214
+ }, mockContext);
209
215
  const graph = loadGraphFromSnapshot(snapshotId);
210
216
 
211
217
  const destNode = graph.nodes.get('https://redirect.com/dest');
@@ -223,7 +229,7 @@ test('redirect safety', async () => {
223
229
  clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
224
230
  // We might mock /a again if it retries, but it shouldn't infinitely loop
225
231
 
226
- const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
232
+ const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
227
233
  const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
228
234
  // It should eventually stop
229
235
  expect(graphLoop.getNodes().length).toBeGreaterThan(0);
@@ -246,7 +252,7 @@ test('mime check', async () => {
246
252
  <html><a href="/data">Data</a></html>
247
253
  `, { headers: { 'content-type': 'text/html' } });
248
254
 
249
- const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
255
+ const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
250
256
  const graph = loadGraphFromSnapshot(snapshotId);
251
257
 
252
258
  // /data should be in graph
@@ -267,7 +273,7 @@ test('self-link guard', async () => {
267
273
 
268
274
  client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
269
275
 
270
- const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
276
+ const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
271
277
  const graph = loadGraphFromSnapshot(snapshotId);
272
278
 
273
279
  const edges = graph.getEdges();
@@ -288,7 +294,7 @@ test('limit warning', async () => {
288
294
 
289
295
  client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
290
296
 
291
- const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 });
297
+ const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
292
298
  const graph = loadGraphFromSnapshot(snapshotId);
293
299
 
294
300
  expect(graph.limitReached).toBe(true);
@@ -315,7 +321,7 @@ test('seeds from sitemap', async () => {
315
321
  ignoreRobots: true,
316
322
  sitemap: 'true',
317
323
  rate: 1000
318
- });
324
+ }, mockContext);
319
325
  const graph = loadGraphFromSnapshot(snapshotId);
320
326
 
321
327
  const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
@@ -332,7 +338,7 @@ test('incremental crawl uses etags', async () => {
332
338
  headers: { 'content-type': 'text/html', 'etag': '"v1"' }
333
339
  });
334
340
 
335
- const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 });
341
+ const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
336
342
  const graph1 = loadGraphFromSnapshot(snapshotId1);
337
343
  const node1 = graph1.nodes.get('https://incremental.com/');
338
344
  expect(node1?.etag).toBe('"v1"');
@@ -350,7 +356,7 @@ test('incremental crawl uses etags', async () => {
350
356
  ignoreRobots: true,
351
357
  previousGraph: graph1,
352
358
  rate: 1000
353
- });
359
+ }, mockContext);
354
360
  const graph2 = loadGraphFromSnapshot(snapshotId2);
355
361
 
356
362
  const node2 = graph2.nodes.get('https://incremental.com/');
@@ -0,0 +1,134 @@
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
+ import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
3
+ import fs from 'node:fs';
4
+ import os from 'node:os';
5
+ import path from 'node:path';
6
+
7
+ vi.mock('node:fs');
8
+ vi.mock('node:os');
9
+ vi.mock('better-sqlite3', () => {
10
+ return {
11
+ default: vi.fn(function () {
12
+ return {
13
+ pragma: vi.fn().mockReturnValue('ok'),
14
+ prepare: vi.fn().mockReturnValue({
15
+ run: vi.fn(),
16
+ get: vi.fn(),
17
+ iterate: vi.fn(),
18
+ all: vi.fn()
19
+ }),
20
+ exec: vi.fn(),
21
+ close: vi.fn(),
22
+ transaction: vi.fn((fn) => fn),
23
+ };
24
+ }),
25
+ };
26
+ });
27
+ vi.mock('../../src/db/schema.js', () => ({
28
+ initSchema: vi.fn(),
29
+ }));
30
+
31
+ describe('DB Index', () => {
32
+ const originalEnv = process.env;
33
+
34
+ beforeEach(() => {
35
+ vi.resetAllMocks();
36
+ closeDb();
37
+ process.env = { ...originalEnv };
38
+ // Default mock behaviors
39
+ vi.mocked(os.homedir).mockReturnValue('/home/user');
40
+ vi.mocked(fs.existsSync).mockReturnValue(false);
41
+ vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
42
+ vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
43
+ });
44
+
45
+ afterEach(() => {
46
+ process.env = originalEnv;
47
+ closeDb();
48
+ });
49
+
50
+ describe('getDbPath', () => {
51
+ it('should return :memory: in test environment', () => {
52
+ process.env.NODE_ENV = 'test';
53
+ expect(getDbPath()).toBe(':memory:');
54
+ });
55
+
56
+ it('should return custom path if CRAWLITH_DB_PATH is set', () => {
57
+ process.env.NODE_ENV = 'production';
58
+ process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
59
+ expect(getDbPath()).toBe('/custom/path/db.sqlite');
60
+ });
61
+
62
+ it('should return default path in home dir if no env var', () => {
63
+ process.env.NODE_ENV = 'production';
64
+ delete process.env.CRAWLITH_DB_PATH;
65
+
66
+ const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
67
+ expect(getDbPath()).toBe(expectedPath);
68
+
69
+ expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
70
+ expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
71
+ });
72
+
73
+ it('should not create dir if it exists', () => {
74
+ process.env.NODE_ENV = 'production';
75
+ vi.mocked(fs.existsSync).mockReturnValue(true);
76
+
77
+ getDbPath();
78
+
79
+ expect(fs.mkdirSync).not.toHaveBeenCalled();
80
+ });
81
+ });
82
+
83
+ describe('getDb', () => {
84
+ it('should create a new database instance', () => {
85
+ process.env.NODE_ENV = 'production';
86
+ const db = getDb();
87
+ expect(db).toBeDefined();
88
+ // Check if pragma was called
89
+ expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
90
+ });
91
+
92
+ it('should return existing instance if called twice', () => {
93
+ process.env.NODE_ENV = 'production';
94
+ const db1 = getDb();
95
+ const db2 = getDb();
96
+ expect(db1).toBe(db2);
97
+ });
98
+
99
+ it('should handle permission errors gracefully', () => {
100
+ process.env.NODE_ENV = 'production';
101
+ // Avoid getDbPath throwing
102
+ vi.mocked(fs.existsSync).mockReturnValue(true);
103
+
104
+ vi.mocked(fs.chmodSync).mockImplementation((path) => {
105
+ if (path.toString().endsWith('crawlith.db')) {
106
+ throw new Error('EPERM');
107
+ }
108
+ });
109
+
110
+ expect(() => getDb()).not.toThrow();
111
+ });
112
+
113
+ it('should warn if integrity check fails', async () => {
114
+ const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
115
+ process.env.NODE_ENV = 'production';
116
+ vi.mocked(fs.existsSync).mockReturnValue(true);
117
+
118
+ const MockDatabase = (await import('better-sqlite3')).default;
119
+ vi.mocked(MockDatabase).mockImplementationOnce(function() {
120
+ return {
121
+ pragma: vi.fn().mockReturnValue('corrupt'),
122
+ prepare: vi.fn(),
123
+ exec: vi.fn(),
124
+ close: vi.fn(),
125
+ transaction: vi.fn(),
126
+ } as any;
127
+ });
128
+
129
+ getDb();
130
+
131
+ expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
132
+ });
133
+ });
134
+ });
@@ -0,0 +1,115 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import Database from 'better-sqlite3';
3
+ import { PageRepository } from '../../src/db/repositories/PageRepository.js';
4
+ import { initSchema } from '../../src/db/schema.js';
5
+
6
+ describe('PageRepository', () => {
7
+ let db: Database.Database;
8
+ let repo: PageRepository;
9
+
10
+ beforeEach(() => {
11
+ db = new Database(':memory:');
12
+ initSchema(db);
13
+ repo = new PageRepository(db);
14
+
15
+ // Seed required tables (sites, snapshots)
16
+ db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
17
+ db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
18
+ });
19
+
20
+ afterEach(() => {
21
+ db.close();
22
+ });
23
+
24
+ it('should get pages by URLs in chunks', () => {
25
+ const urls: string[] = [];
26
+ const siteId = 1;
27
+ const snapshotId = 1;
28
+
29
+ // Create 1000 pages (chunk size is 900)
30
+ const insertStmt = db.prepare(`
31
+ INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
32
+ VALUES (?, ?, ?, ?)
33
+ `);
34
+
35
+ const tx = db.transaction(() => {
36
+ for (let i = 0; i < 1000; i++) {
37
+ const url = `http://example.com/page${i}`;
38
+ urls.push(url);
39
+ insertStmt.run(siteId, url, snapshotId, snapshotId);
40
+ }
41
+ });
42
+ tx();
43
+
44
+ // Fetch pages
45
+ const pages = repo.getPagesByUrls(siteId, urls);
46
+
47
+ expect(pages).toHaveLength(1000);
48
+ expect(pages[0].normalized_url).toBe('http://example.com/page0');
49
+ expect(pages[999].normalized_url).toBe('http://example.com/page999');
50
+ });
51
+
52
+ it('should return empty array for empty URL list', () => {
53
+ const pages = repo.getPagesByUrls(1, []);
54
+ expect(pages).toEqual([]);
55
+ });
56
+
57
+ it('should iterate over pages by snapshot', () => {
58
+ const siteId = 1;
59
+ const snapshotId = 1;
60
+ const insertStmt = db.prepare(`
61
+ INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
62
+ VALUES (?, ?, ?, ?)
63
+ `);
64
+
65
+ db.transaction(() => {
66
+ insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
67
+ insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
68
+ insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
69
+ })();
70
+
71
+ const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
72
+ const pages = Array.from(iterator);
73
+
74
+ expect(pages).toHaveLength(3);
75
+ expect(pages.map(p => p.normalized_url).sort()).toEqual([
76
+ 'http://example.com/1',
77
+ 'http://example.com/2',
78
+ 'http://example.com/3'
79
+ ]);
80
+ });
81
+
82
+ it('should upsert and get ID', () => {
83
+ const pageData = {
84
+ site_id: 1,
85
+ normalized_url: 'http://example.com/new',
86
+ last_seen_snapshot_id: 1,
87
+ http_status: 200,
88
+ };
89
+
90
+ const id = repo.upsertAndGetId(pageData);
91
+ expect(id).toBeGreaterThan(0);
92
+
93
+ const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
94
+ expect(sameId).toBe(id);
95
+
96
+ const page = repo.getPage(1, 'http://example.com/new');
97
+ expect(page?.http_status).toBe(404);
98
+ });
99
+
100
+ it('should get ID by URL', () => {
101
+ const pageData = {
102
+ site_id: 1,
103
+ normalized_url: 'http://example.com/id-test',
104
+ last_seen_snapshot_id: 1,
105
+ };
106
+ repo.upsertPage(pageData);
107
+
108
+ const id = repo.getIdByUrl(1, 'http://example.com/id-test');
109
+ expect(id).toBeDefined();
110
+ expect(id).toBeGreaterThan(0);
111
+
112
+ const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
113
+ expect(missingId).toBeUndefined();
114
+ });
115
+ });
@@ -0,0 +1,72 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import Database from 'better-sqlite3';
3
+ import { initSchema } from '../src/db/schema.js';
4
+ import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
+ import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
+
7
+ describe('SiteRepository & SnapshotRepository', () => {
8
+ let db: Database.Database;
9
+ let siteRepo: SiteRepository;
10
+ let snapshotRepo: SnapshotRepository;
11
+
12
+ beforeEach(() => {
13
+ db = new Database(':memory:');
14
+ initSchema(db);
15
+ siteRepo = new SiteRepository(db);
16
+ snapshotRepo = new SnapshotRepository(db);
17
+ });
18
+
19
+ afterEach(() => {
20
+ db.close();
21
+ });
22
+
23
+ it('getAllSites should return all sites ordered by domain', () => {
24
+ siteRepo.createSite('b.com');
25
+ siteRepo.createSite('a.com');
26
+ siteRepo.createSite('c.com');
27
+
28
+ const sites = siteRepo.getAllSites();
29
+ expect(sites).toHaveLength(3);
30
+ expect(sites[0].domain).toBe('a.com');
31
+ expect(sites[1].domain).toBe('b.com');
32
+ expect(sites[2].domain).toBe('c.com');
33
+ });
34
+
35
+ it('getSnapshotCount should return correct count', () => {
36
+ const siteId = siteRepo.createSite('test.com');
37
+
38
+ expect(snapshotRepo.getSnapshotCount(siteId)).toBe(0);
39
+
40
+ snapshotRepo.createSnapshot(siteId, 'full');
41
+ expect(snapshotRepo.getSnapshotCount(siteId)).toBe(1);
42
+
43
+ snapshotRepo.createSnapshot(siteId, 'partial');
44
+ expect(snapshotRepo.getSnapshotCount(siteId)).toBe(2);
45
+ });
46
+
47
+ it('getLatestSnapshot should return the latest snapshot', () => {
48
+ const siteId = siteRepo.createSite('test.com');
49
+
50
+ // First snapshot
51
+ snapshotRepo.createSnapshot(siteId, 'full', 'completed');
52
+ // Wait a tiny bit to ensure timestamp diff if needed, but synchronous execution usually implies order
53
+
54
+ // Second snapshot
55
+ const secondId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
56
+
57
+ const latest = snapshotRepo.getLatestSnapshot(siteId);
58
+ expect(latest).toBeDefined();
59
+ expect(latest?.id).toBe(secondId);
60
+ expect(latest?.status).toBe('running');
61
+ });
62
+
63
+ it('getLatestSnapshot with status filter', () => {
64
+ const siteId = siteRepo.createSite('test.com');
65
+ const firstId = snapshotRepo.createSnapshot(siteId, 'full', 'completed');
66
+ snapshotRepo.createSnapshot(siteId, 'full', 'running');
67
+
68
+ const latestCompleted = snapshotRepo.getLatestSnapshot(siteId, 'completed');
69
+ expect(latestCompleted).toBeDefined();
70
+ expect(latestCompleted?.id).toBe(firstId);
71
+ });
72
+ });
@@ -89,7 +89,7 @@ describe('Duplicate Detection', () => {
89
89
  graph.updateNodeData('https://example.com/b', { contentHash: 'h1' });
90
90
 
91
91
  // Add edge pointing to B
92
- graph.edges.set('https://example.com/source|https://example.com/b', 1);
92
+ graph.addEdge('https://example.com/source', 'https://example.com/b', 1);
93
93
 
94
94
  // Force A to be the representative by giving it higher inLinks manually, though it's determined dynamically
95
95
  graph.nodes.get('https://example.com/a')!.inLinks = 10;
@@ -105,6 +105,6 @@ describe('Duplicate Detection', () => {
105
105
  expect(b.collapseInto).toBe('https://example.com/a');
106
106
 
107
107
  // Check edge transfer
108
- expect(graph.edges.has('https://example.com/source|https://example.com/a')).toBe(true);
108
+ expect(graph.edges.has(Graph.getEdgeKey('https://example.com/source', 'https://example.com/a'))).toBe(true);
109
109
  });
110
110
  });
@@ -0,0 +1,86 @@
1
+ import { extractLinks } from '../src/crawler/extract.js';
2
+ import { test, expect, describe, vi, afterEach } from 'vitest';
3
+ import * as cheerio from 'cheerio';
4
+
5
+ // Mock cheerio.load to allow us to simulate errors
6
+ vi.mock('cheerio', async (importOriginal) => {
7
+ const mod = await importOriginal<any>();
8
+ return {
9
+ ...mod,
10
+ load: vi.fn((...args: any[]) => mod.load(...args))
11
+ };
12
+ });
13
+
14
+ describe('extractLinks', () => {
15
+ afterEach(() => {
16
+ vi.restoreAllMocks();
17
+ });
18
+
19
+ test('should extract links correctly', () => {
20
+ const html = `
21
+ <html>
22
+ <body>
23
+ <a href="/foo">Foo</a>
24
+ <a href="bar">Bar</a>
25
+ <a href="https://other.com/baz">Baz</a>
26
+ <a href="#top">Top</a>
27
+ </body>
28
+ </html>
29
+ `;
30
+ const links = extractLinks(html, 'https://example.com/page/');
31
+ expect(links).toContain('https://example.com/foo');
32
+ expect(links).toContain('https://example.com/page/bar');
33
+ expect(links).toContain('https://other.com/baz');
34
+ expect(links).not.toContain('https://example.com/page/#top');
35
+ expect(links).toContain('https://example.com/page/');
36
+ });
37
+
38
+ test('should handle cheerio errors gracefully', () => {
39
+ const error = new Error('Cheerio error');
40
+
41
+ // Mock cheerio.load to throw an error
42
+ vi.mocked(cheerio.load).mockImplementationOnce(() => {
43
+ throw error;
44
+ });
45
+
46
+ const links = extractLinks('<html></html>', 'https://example.com');
47
+
48
+ expect(links).toEqual([]);
49
+ // No console error expected
50
+ });
51
+
52
+ test('should handle non-Error exceptions gracefully', () => {
53
+ const error = 'String error'; // Simulate a thrown string
54
+
55
+ vi.mocked(cheerio.load).mockImplementationOnce(() => {
56
+ throw error;
57
+ });
58
+
59
+ const links = extractLinks('<html></html>', 'https://example.com');
60
+
61
+ expect(links).toEqual([]);
62
+ // No console error expected
63
+ });
64
+
65
+ test('should ignore invalid URLs that cause URL constructor to throw', () => {
66
+ const html = '<a href="http://[">Invalid</a>';
67
+ const links = extractLinks(html, 'https://example.com');
68
+ expect(links).toEqual([]);
69
+ });
70
+
71
+ test('should ignore non-http protocols', () => {
72
+ const html = `
73
+ <a href="mailto:test@example.com">Mail</a>
74
+ <a href="javascript:void(0)">JS</a>
75
+ <a href="ftp://example.com/file">FTP</a>
76
+ `;
77
+ const links = extractLinks(html, 'https://example.com');
78
+ expect(links).toEqual([]);
79
+ });
80
+
81
+ test('should ignore links without href', () => {
82
+ const html = '<a>No Href</a>';
83
+ const links = extractLinks(html, 'https://example.com');
84
+ expect(links).toEqual([]);
85
+ });
86
+ });