@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
+
import { Graph } from '../src/graph/graph.js';
|
|
3
|
+
import { detectContentClusters } from '../src/graph/cluster.js';
|
|
4
|
+
|
|
5
|
+
describe('Cluster Risk Heuristic', () => {
|
|
6
|
+
let graph: Graph;
|
|
7
|
+
|
|
8
|
+
beforeEach(() => {
|
|
9
|
+
graph = new Graph();
|
|
10
|
+
});
|
|
11
|
+
|
|
12
|
+
it('should assign HIGH risk to clusters with identical titles', () => {
|
|
13
|
+
const html = '<html><head><title>Duplicate Title</title></head><body>Content</body></html>';
|
|
14
|
+
const h = 0b101010n.toString();
|
|
15
|
+
|
|
16
|
+
graph.addNode('https://example.com/p1', 0, 200);
|
|
17
|
+
graph.addNode('https://example.com/p2', 0, 200);
|
|
18
|
+
graph.addNode('https://example.com/p3', 0, 200);
|
|
19
|
+
|
|
20
|
+
graph.updateNodeData('https://example.com/p1', { simhash: h, html });
|
|
21
|
+
graph.updateNodeData('https://example.com/p2', { simhash: h, html });
|
|
22
|
+
graph.updateNodeData('https://example.com/p3', { simhash: h, html });
|
|
23
|
+
|
|
24
|
+
const clusters = detectContentClusters(graph, 2, 2);
|
|
25
|
+
|
|
26
|
+
expect(clusters.length).toBe(1);
|
|
27
|
+
expect(clusters[0].risk).toBe('high');
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it('should assign HIGH risk to clusters with identical H1s', () => {
|
|
31
|
+
const h = 0b101010n.toString();
|
|
32
|
+
|
|
33
|
+
graph.addNode('https://example.com/p1', 0, 200);
|
|
34
|
+
graph.addNode('https://example.com/p2', 0, 200);
|
|
35
|
+
graph.addNode('https://example.com/p3', 0, 200);
|
|
36
|
+
|
|
37
|
+
// Different titles, same H1
|
|
38
|
+
graph.updateNodeData('https://example.com/p1', {
|
|
39
|
+
simhash: h,
|
|
40
|
+
html: '<html><head><title>Page 1</title></head><body><h1>Duplicate Header</h1></body></html>'
|
|
41
|
+
});
|
|
42
|
+
graph.updateNodeData('https://example.com/p2', {
|
|
43
|
+
simhash: h,
|
|
44
|
+
html: '<html><head><title>Page 2</title></head><body><h1>Duplicate Header</h1></body></html>'
|
|
45
|
+
});
|
|
46
|
+
graph.updateNodeData('https://example.com/p3', {
|
|
47
|
+
simhash: h,
|
|
48
|
+
html: '<html><head><title>Page 3</title></head><body><h1>Duplicate Header</h1></body></html>'
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
const clusters = detectContentClusters(graph, 2, 2);
|
|
52
|
+
|
|
53
|
+
expect(clusters.length).toBe(1);
|
|
54
|
+
expect(clusters[0].risk).toBe('high');
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it('should assign LOW risk to small clusters with unique titles and H1s', () => {
|
|
58
|
+
const h = 0b101010n.toString();
|
|
59
|
+
|
|
60
|
+
graph.addNode('https://example.com/p1', 0, 200);
|
|
61
|
+
graph.addNode('https://example.com/p2', 0, 200);
|
|
62
|
+
graph.addNode('https://example.com/p3', 0, 200);
|
|
63
|
+
|
|
64
|
+
graph.updateNodeData('https://example.com/p1', {
|
|
65
|
+
simhash: h,
|
|
66
|
+
html: '<html><head><title>Page 1</title></head><body><h1>Header 1</h1></body></html>'
|
|
67
|
+
});
|
|
68
|
+
graph.updateNodeData('https://example.com/p2', {
|
|
69
|
+
simhash: h,
|
|
70
|
+
html: '<html><head><title>Page 2</title></head><body><h1>Header 2</h1></body></html>'
|
|
71
|
+
});
|
|
72
|
+
graph.updateNodeData('https://example.com/p3', {
|
|
73
|
+
simhash: h,
|
|
74
|
+
html: '<html><head><title>Page 3</title></head><body><h1>Header 3</h1></body></html>'
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
const clusters = detectContentClusters(graph, 2, 2);
|
|
78
|
+
|
|
79
|
+
expect(clusters.length).toBe(1);
|
|
80
|
+
expect(clusters[0].risk).toBe('low');
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
it('should assign MEDIUM risk to large clusters even with unique titles', () => {
|
|
84
|
+
const h = 0b101010n.toString();
|
|
85
|
+
|
|
86
|
+
// 12 nodes, all unique titles
|
|
87
|
+
for (let i = 0; i < 12; i++) {
|
|
88
|
+
const url = `https://example.com/p${i}`;
|
|
89
|
+
graph.addNode(url, 0, 200);
|
|
90
|
+
graph.updateNodeData(url, {
|
|
91
|
+
simhash: h,
|
|
92
|
+
html: `<html><head><title>Page ${i}</title></head><body><h1>Header ${i}</h1></body></html>`
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const clusters = detectContentClusters(graph, 2, 2);
|
|
97
|
+
|
|
98
|
+
expect(clusters.length).toBe(1);
|
|
99
|
+
expect(clusters[0].risk).toBe('medium');
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
it('should handle missing HTML gracefully', () => {
|
|
103
|
+
const h = 0b101010n.toString();
|
|
104
|
+
|
|
105
|
+
graph.addNode('https://example.com/p1', 0, 200);
|
|
106
|
+
graph.addNode('https://example.com/p2', 0, 200);
|
|
107
|
+
|
|
108
|
+
// No HTML provided
|
|
109
|
+
graph.updateNodeData('https://example.com/p1', { simhash: h });
|
|
110
|
+
graph.updateNodeData('https://example.com/p2', { simhash: h });
|
|
111
|
+
|
|
112
|
+
const clusters = detectContentClusters(graph, 2, 2);
|
|
113
|
+
|
|
114
|
+
expect(clusters.length).toBe(1);
|
|
115
|
+
// Fallback to size based? 2 nodes -> low risk
|
|
116
|
+
expect(clusters[0].risk).toBe('low');
|
|
117
|
+
});
|
|
118
|
+
});
|
package/tests/crawler.test.ts
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
|
-
import { test, expect, beforeEach, afterEach } from 'vitest';
|
|
1
|
+
import { test, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
2
2
|
import { crawl } from '../src/crawler/crawl.js';
|
|
3
3
|
import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
|
|
4
4
|
import { closeDb } from '../src/db/index.js';
|
|
5
5
|
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
6
|
+
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
7
|
+
import { EngineContext } from '../src/events.js';
|
|
6
8
|
|
|
7
9
|
let mockAgent: MockAgent;
|
|
10
|
+
const mockContext: EngineContext = { emit: vi.fn() };
|
|
8
11
|
|
|
9
12
|
beforeEach(() => {
|
|
10
13
|
process.env.CRAWLITH_DB_PATH = ':memory:';
|
|
11
14
|
mockAgent = new MockAgent();
|
|
12
15
|
mockAgent.disableNetConnect();
|
|
13
16
|
setGlobalDispatcher(mockAgent);
|
|
17
|
+
|
|
18
|
+
// IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
|
|
19
|
+
vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
|
|
14
20
|
});
|
|
15
21
|
|
|
16
22
|
afterEach(() => {
|
|
@@ -68,7 +74,7 @@ test('crawler should crawl and build graph', async () => {
|
|
|
68
74
|
depth: 2,
|
|
69
75
|
ignoreRobots: false,
|
|
70
76
|
rate: 1000
|
|
71
|
-
});
|
|
77
|
+
}, mockContext);
|
|
72
78
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
73
79
|
|
|
74
80
|
const nodes = graph.getNodes();
|
|
@@ -110,7 +116,7 @@ test('hard page limit', async () => {
|
|
|
110
116
|
depth: 5,
|
|
111
117
|
ignoreRobots: true,
|
|
112
118
|
rate: 1000
|
|
113
|
-
});
|
|
119
|
+
}, mockContext);
|
|
114
120
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
115
121
|
|
|
116
122
|
// Should have visited root + 1 other page (total 2 nodes with status > 0)
|
|
@@ -138,7 +144,7 @@ test('hard depth cap', async () => {
|
|
|
138
144
|
depth: 20, // requested 20, but internal hard cap is 10
|
|
139
145
|
ignoreRobots: true,
|
|
140
146
|
rate: 1000
|
|
141
|
-
});
|
|
147
|
+
}, mockContext);
|
|
142
148
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
143
149
|
|
|
144
150
|
const crawledNodes = graph.getNodes().filter(n => n.status > 0);
|
|
@@ -172,7 +178,7 @@ test('parameter explosion control', async () => {
|
|
|
172
178
|
stripQuery: false,
|
|
173
179
|
detectTraps: true,
|
|
174
180
|
rate: 1000
|
|
175
|
-
});
|
|
181
|
+
}, mockContext);
|
|
176
182
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
177
183
|
|
|
178
184
|
// Should only crawl 5 variations + root
|
|
@@ -205,7 +211,7 @@ test('redirect safety', async () => {
|
|
|
205
211
|
depth: 5,
|
|
206
212
|
ignoreRobots: true,
|
|
207
213
|
rate: 1000
|
|
208
|
-
});
|
|
214
|
+
}, mockContext);
|
|
209
215
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
210
216
|
|
|
211
217
|
const destNode = graph.nodes.get('https://redirect.com/dest');
|
|
@@ -223,7 +229,7 @@ test('redirect safety', async () => {
|
|
|
223
229
|
clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
|
|
224
230
|
// We might mock /a again if it retries, but it shouldn't infinitely loop
|
|
225
231
|
|
|
226
|
-
const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
|
|
232
|
+
const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
227
233
|
const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
|
|
228
234
|
// It should eventually stop
|
|
229
235
|
expect(graphLoop.getNodes().length).toBeGreaterThan(0);
|
|
@@ -246,7 +252,7 @@ test('mime check', async () => {
|
|
|
246
252
|
<html><a href="/data">Data</a></html>
|
|
247
253
|
`, { headers: { 'content-type': 'text/html' } });
|
|
248
254
|
|
|
249
|
-
const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
|
|
255
|
+
const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
250
256
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
251
257
|
|
|
252
258
|
// /data should be in graph
|
|
@@ -267,7 +273,7 @@ test('self-link guard', async () => {
|
|
|
267
273
|
|
|
268
274
|
client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
|
|
269
275
|
|
|
270
|
-
const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
|
|
276
|
+
const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
271
277
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
272
278
|
|
|
273
279
|
const edges = graph.getEdges();
|
|
@@ -288,7 +294,7 @@ test('limit warning', async () => {
|
|
|
288
294
|
|
|
289
295
|
client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
|
|
290
296
|
|
|
291
|
-
const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 });
|
|
297
|
+
const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
292
298
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
293
299
|
|
|
294
300
|
expect(graph.limitReached).toBe(true);
|
|
@@ -315,7 +321,7 @@ test('seeds from sitemap', async () => {
|
|
|
315
321
|
ignoreRobots: true,
|
|
316
322
|
sitemap: 'true',
|
|
317
323
|
rate: 1000
|
|
318
|
-
});
|
|
324
|
+
}, mockContext);
|
|
319
325
|
const graph = loadGraphFromSnapshot(snapshotId);
|
|
320
326
|
|
|
321
327
|
const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
|
|
@@ -332,7 +338,7 @@ test('incremental crawl uses etags', async () => {
|
|
|
332
338
|
headers: { 'content-type': 'text/html', 'etag': '"v1"' }
|
|
333
339
|
});
|
|
334
340
|
|
|
335
|
-
const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 });
|
|
341
|
+
const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
336
342
|
const graph1 = loadGraphFromSnapshot(snapshotId1);
|
|
337
343
|
const node1 = graph1.nodes.get('https://incremental.com/');
|
|
338
344
|
expect(node1?.etag).toBe('"v1"');
|
|
@@ -350,7 +356,7 @@ test('incremental crawl uses etags', async () => {
|
|
|
350
356
|
ignoreRobots: true,
|
|
351
357
|
previousGraph: graph1,
|
|
352
358
|
rate: 1000
|
|
353
|
-
});
|
|
359
|
+
}, mockContext);
|
|
354
360
|
const graph2 = loadGraphFromSnapshot(snapshotId2);
|
|
355
361
|
|
|
356
362
|
const node2 = graph2.nodes.get('https://incremental.com/');
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
|
|
7
|
+
vi.mock('node:fs');
|
|
8
|
+
vi.mock('node:os');
|
|
9
|
+
vi.mock('better-sqlite3', () => {
|
|
10
|
+
return {
|
|
11
|
+
default: vi.fn(function () {
|
|
12
|
+
return {
|
|
13
|
+
pragma: vi.fn().mockReturnValue('ok'),
|
|
14
|
+
prepare: vi.fn().mockReturnValue({
|
|
15
|
+
run: vi.fn(),
|
|
16
|
+
get: vi.fn(),
|
|
17
|
+
iterate: vi.fn(),
|
|
18
|
+
all: vi.fn()
|
|
19
|
+
}),
|
|
20
|
+
exec: vi.fn(),
|
|
21
|
+
close: vi.fn(),
|
|
22
|
+
transaction: vi.fn((fn) => fn),
|
|
23
|
+
};
|
|
24
|
+
}),
|
|
25
|
+
};
|
|
26
|
+
});
|
|
27
|
+
vi.mock('../../src/db/schema.js', () => ({
|
|
28
|
+
initSchema: vi.fn(),
|
|
29
|
+
}));
|
|
30
|
+
|
|
31
|
+
describe('DB Index', () => {
|
|
32
|
+
const originalEnv = process.env;
|
|
33
|
+
|
|
34
|
+
beforeEach(() => {
|
|
35
|
+
vi.resetAllMocks();
|
|
36
|
+
closeDb();
|
|
37
|
+
process.env = { ...originalEnv };
|
|
38
|
+
// Default mock behaviors
|
|
39
|
+
vi.mocked(os.homedir).mockReturnValue('/home/user');
|
|
40
|
+
vi.mocked(fs.existsSync).mockReturnValue(false);
|
|
41
|
+
vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
|
|
42
|
+
vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
afterEach(() => {
|
|
46
|
+
process.env = originalEnv;
|
|
47
|
+
closeDb();
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
describe('getDbPath', () => {
|
|
51
|
+
it('should return :memory: in test environment', () => {
|
|
52
|
+
process.env.NODE_ENV = 'test';
|
|
53
|
+
expect(getDbPath()).toBe(':memory:');
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
it('should return custom path if CRAWLITH_DB_PATH is set', () => {
|
|
57
|
+
process.env.NODE_ENV = 'production';
|
|
58
|
+
process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
|
|
59
|
+
expect(getDbPath()).toBe('/custom/path/db.sqlite');
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
it('should return default path in home dir if no env var', () => {
|
|
63
|
+
process.env.NODE_ENV = 'production';
|
|
64
|
+
delete process.env.CRAWLITH_DB_PATH;
|
|
65
|
+
|
|
66
|
+
const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
|
|
67
|
+
expect(getDbPath()).toBe(expectedPath);
|
|
68
|
+
|
|
69
|
+
expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
|
|
70
|
+
expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
it('should not create dir if it exists', () => {
|
|
74
|
+
process.env.NODE_ENV = 'production';
|
|
75
|
+
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
76
|
+
|
|
77
|
+
getDbPath();
|
|
78
|
+
|
|
79
|
+
expect(fs.mkdirSync).not.toHaveBeenCalled();
|
|
80
|
+
});
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
describe('getDb', () => {
|
|
84
|
+
it('should create a new database instance', () => {
|
|
85
|
+
process.env.NODE_ENV = 'production';
|
|
86
|
+
const db = getDb();
|
|
87
|
+
expect(db).toBeDefined();
|
|
88
|
+
// Check if pragma was called
|
|
89
|
+
expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
it('should return existing instance if called twice', () => {
|
|
93
|
+
process.env.NODE_ENV = 'production';
|
|
94
|
+
const db1 = getDb();
|
|
95
|
+
const db2 = getDb();
|
|
96
|
+
expect(db1).toBe(db2);
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
it('should handle permission errors gracefully', () => {
|
|
100
|
+
process.env.NODE_ENV = 'production';
|
|
101
|
+
// Avoid getDbPath throwing
|
|
102
|
+
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
103
|
+
|
|
104
|
+
vi.mocked(fs.chmodSync).mockImplementation((path) => {
|
|
105
|
+
if (path.toString().endsWith('crawlith.db')) {
|
|
106
|
+
throw new Error('EPERM');
|
|
107
|
+
}
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
expect(() => getDb()).not.toThrow();
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
it('should warn if integrity check fails', async () => {
|
|
114
|
+
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
|
|
115
|
+
process.env.NODE_ENV = 'production';
|
|
116
|
+
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
117
|
+
|
|
118
|
+
const MockDatabase = (await import('better-sqlite3')).default;
|
|
119
|
+
vi.mocked(MockDatabase).mockImplementationOnce(function() {
|
|
120
|
+
return {
|
|
121
|
+
pragma: vi.fn().mockReturnValue('corrupt'),
|
|
122
|
+
prepare: vi.fn(),
|
|
123
|
+
exec: vi.fn(),
|
|
124
|
+
close: vi.fn(),
|
|
125
|
+
transaction: vi.fn(),
|
|
126
|
+
} as any;
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
getDb();
|
|
130
|
+
|
|
131
|
+
expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
|
|
132
|
+
});
|
|
133
|
+
});
|
|
134
|
+
});
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import Database from 'better-sqlite3';
|
|
3
|
+
import { PageRepository } from '../../src/db/repositories/PageRepository.js';
|
|
4
|
+
import { initSchema } from '../../src/db/schema.js';
|
|
5
|
+
|
|
6
|
+
describe('PageRepository', () => {
|
|
7
|
+
let db: Database.Database;
|
|
8
|
+
let repo: PageRepository;
|
|
9
|
+
|
|
10
|
+
beforeEach(() => {
|
|
11
|
+
db = new Database(':memory:');
|
|
12
|
+
initSchema(db);
|
|
13
|
+
repo = new PageRepository(db);
|
|
14
|
+
|
|
15
|
+
// Seed required tables (sites, snapshots)
|
|
16
|
+
db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
|
|
17
|
+
db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
afterEach(() => {
|
|
21
|
+
db.close();
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
it('should get pages by URLs in chunks', () => {
|
|
25
|
+
const urls: string[] = [];
|
|
26
|
+
const siteId = 1;
|
|
27
|
+
const snapshotId = 1;
|
|
28
|
+
|
|
29
|
+
// Create 1000 pages (chunk size is 900)
|
|
30
|
+
const insertStmt = db.prepare(`
|
|
31
|
+
INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
|
|
32
|
+
VALUES (?, ?, ?, ?)
|
|
33
|
+
`);
|
|
34
|
+
|
|
35
|
+
const tx = db.transaction(() => {
|
|
36
|
+
for (let i = 0; i < 1000; i++) {
|
|
37
|
+
const url = `http://example.com/page${i}`;
|
|
38
|
+
urls.push(url);
|
|
39
|
+
insertStmt.run(siteId, url, snapshotId, snapshotId);
|
|
40
|
+
}
|
|
41
|
+
});
|
|
42
|
+
tx();
|
|
43
|
+
|
|
44
|
+
// Fetch pages
|
|
45
|
+
const pages = repo.getPagesByUrls(siteId, urls);
|
|
46
|
+
|
|
47
|
+
expect(pages).toHaveLength(1000);
|
|
48
|
+
expect(pages[0].normalized_url).toBe('http://example.com/page0');
|
|
49
|
+
expect(pages[999].normalized_url).toBe('http://example.com/page999');
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
it('should return empty array for empty URL list', () => {
|
|
53
|
+
const pages = repo.getPagesByUrls(1, []);
|
|
54
|
+
expect(pages).toEqual([]);
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it('should iterate over pages by snapshot', () => {
|
|
58
|
+
const siteId = 1;
|
|
59
|
+
const snapshotId = 1;
|
|
60
|
+
const insertStmt = db.prepare(`
|
|
61
|
+
INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
|
|
62
|
+
VALUES (?, ?, ?, ?)
|
|
63
|
+
`);
|
|
64
|
+
|
|
65
|
+
db.transaction(() => {
|
|
66
|
+
insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
|
|
67
|
+
insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
|
|
68
|
+
insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
|
|
69
|
+
})();
|
|
70
|
+
|
|
71
|
+
const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
|
|
72
|
+
const pages = Array.from(iterator);
|
|
73
|
+
|
|
74
|
+
expect(pages).toHaveLength(3);
|
|
75
|
+
expect(pages.map(p => p.normalized_url).sort()).toEqual([
|
|
76
|
+
'http://example.com/1',
|
|
77
|
+
'http://example.com/2',
|
|
78
|
+
'http://example.com/3'
|
|
79
|
+
]);
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
it('should upsert and get ID', () => {
|
|
83
|
+
const pageData = {
|
|
84
|
+
site_id: 1,
|
|
85
|
+
normalized_url: 'http://example.com/new',
|
|
86
|
+
last_seen_snapshot_id: 1,
|
|
87
|
+
http_status: 200,
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
const id = repo.upsertAndGetId(pageData);
|
|
91
|
+
expect(id).toBeGreaterThan(0);
|
|
92
|
+
|
|
93
|
+
const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
|
|
94
|
+
expect(sameId).toBe(id);
|
|
95
|
+
|
|
96
|
+
const page = repo.getPage(1, 'http://example.com/new');
|
|
97
|
+
expect(page?.http_status).toBe(404);
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
it('should get ID by URL', () => {
|
|
101
|
+
const pageData = {
|
|
102
|
+
site_id: 1,
|
|
103
|
+
normalized_url: 'http://example.com/id-test',
|
|
104
|
+
last_seen_snapshot_id: 1,
|
|
105
|
+
};
|
|
106
|
+
repo.upsertPage(pageData);
|
|
107
|
+
|
|
108
|
+
const id = repo.getIdByUrl(1, 'http://example.com/id-test');
|
|
109
|
+
expect(id).toBeDefined();
|
|
110
|
+
expect(id).toBeGreaterThan(0);
|
|
111
|
+
|
|
112
|
+
const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
|
|
113
|
+
expect(missingId).toBeUndefined();
|
|
114
|
+
});
|
|
115
|
+
});
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import Database from 'better-sqlite3';
|
|
3
|
+
import { initSchema } from '../src/db/schema.js';
|
|
4
|
+
import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
|
|
5
|
+
import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
|
|
6
|
+
|
|
7
|
+
describe('SiteRepository & SnapshotRepository', () => {
|
|
8
|
+
let db: Database.Database;
|
|
9
|
+
let siteRepo: SiteRepository;
|
|
10
|
+
let snapshotRepo: SnapshotRepository;
|
|
11
|
+
|
|
12
|
+
beforeEach(() => {
|
|
13
|
+
db = new Database(':memory:');
|
|
14
|
+
initSchema(db);
|
|
15
|
+
siteRepo = new SiteRepository(db);
|
|
16
|
+
snapshotRepo = new SnapshotRepository(db);
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
afterEach(() => {
|
|
20
|
+
db.close();
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
it('getAllSites should return all sites ordered by domain', () => {
|
|
24
|
+
siteRepo.createSite('b.com');
|
|
25
|
+
siteRepo.createSite('a.com');
|
|
26
|
+
siteRepo.createSite('c.com');
|
|
27
|
+
|
|
28
|
+
const sites = siteRepo.getAllSites();
|
|
29
|
+
expect(sites).toHaveLength(3);
|
|
30
|
+
expect(sites[0].domain).toBe('a.com');
|
|
31
|
+
expect(sites[1].domain).toBe('b.com');
|
|
32
|
+
expect(sites[2].domain).toBe('c.com');
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it('getSnapshotCount should return correct count', () => {
|
|
36
|
+
const siteId = siteRepo.createSite('test.com');
|
|
37
|
+
|
|
38
|
+
expect(snapshotRepo.getSnapshotCount(siteId)).toBe(0);
|
|
39
|
+
|
|
40
|
+
snapshotRepo.createSnapshot(siteId, 'full');
|
|
41
|
+
expect(snapshotRepo.getSnapshotCount(siteId)).toBe(1);
|
|
42
|
+
|
|
43
|
+
snapshotRepo.createSnapshot(siteId, 'partial');
|
|
44
|
+
expect(snapshotRepo.getSnapshotCount(siteId)).toBe(2);
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
it('getLatestSnapshot should return the latest snapshot', () => {
|
|
48
|
+
const siteId = siteRepo.createSite('test.com');
|
|
49
|
+
|
|
50
|
+
// First snapshot
|
|
51
|
+
snapshotRepo.createSnapshot(siteId, 'full', 'completed');
|
|
52
|
+
// Wait a tiny bit to ensure timestamp diff if needed, but synchronous execution usually implies order
|
|
53
|
+
|
|
54
|
+
// Second snapshot
|
|
55
|
+
const secondId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
|
|
56
|
+
|
|
57
|
+
const latest = snapshotRepo.getLatestSnapshot(siteId);
|
|
58
|
+
expect(latest).toBeDefined();
|
|
59
|
+
expect(latest?.id).toBe(secondId);
|
|
60
|
+
expect(latest?.status).toBe('running');
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
it('getLatestSnapshot with status filter', () => {
|
|
64
|
+
const siteId = siteRepo.createSite('test.com');
|
|
65
|
+
const firstId = snapshotRepo.createSnapshot(siteId, 'full', 'completed');
|
|
66
|
+
snapshotRepo.createSnapshot(siteId, 'full', 'running');
|
|
67
|
+
|
|
68
|
+
const latestCompleted = snapshotRepo.getLatestSnapshot(siteId, 'completed');
|
|
69
|
+
expect(latestCompleted).toBeDefined();
|
|
70
|
+
expect(latestCompleted?.id).toBe(firstId);
|
|
71
|
+
});
|
|
72
|
+
});
|
package/tests/duplicate.test.ts
CHANGED
|
@@ -89,7 +89,7 @@ describe('Duplicate Detection', () => {
|
|
|
89
89
|
graph.updateNodeData('https://example.com/b', { contentHash: 'h1' });
|
|
90
90
|
|
|
91
91
|
// Add edge pointing to B
|
|
92
|
-
graph.
|
|
92
|
+
graph.addEdge('https://example.com/source', 'https://example.com/b', 1);
|
|
93
93
|
|
|
94
94
|
// Force A to be the representative by giving it higher inLinks manually, though it's determined dynamically
|
|
95
95
|
graph.nodes.get('https://example.com/a')!.inLinks = 10;
|
|
@@ -105,6 +105,6 @@ describe('Duplicate Detection', () => {
|
|
|
105
105
|
expect(b.collapseInto).toBe('https://example.com/a');
|
|
106
106
|
|
|
107
107
|
// Check edge transfer
|
|
108
|
-
expect(graph.edges.has('https://example.com/source
|
|
108
|
+
expect(graph.edges.has(Graph.getEdgeKey('https://example.com/source', 'https://example.com/a'))).toBe(true);
|
|
109
109
|
});
|
|
110
110
|
});
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { extractLinks } from '../src/crawler/extract.js';
|
|
2
|
+
import { test, expect, describe, vi, afterEach } from 'vitest';
|
|
3
|
+
import * as cheerio from 'cheerio';
|
|
4
|
+
|
|
5
|
+
// Mock cheerio.load to allow us to simulate errors
|
|
6
|
+
vi.mock('cheerio', async (importOriginal) => {
|
|
7
|
+
const mod = await importOriginal<any>();
|
|
8
|
+
return {
|
|
9
|
+
...mod,
|
|
10
|
+
load: vi.fn((...args: any[]) => mod.load(...args))
|
|
11
|
+
};
|
|
12
|
+
});
|
|
13
|
+
|
|
14
|
+
describe('extractLinks', () => {
|
|
15
|
+
afterEach(() => {
|
|
16
|
+
vi.restoreAllMocks();
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
test('should extract links correctly', () => {
|
|
20
|
+
const html = `
|
|
21
|
+
<html>
|
|
22
|
+
<body>
|
|
23
|
+
<a href="/foo">Foo</a>
|
|
24
|
+
<a href="bar">Bar</a>
|
|
25
|
+
<a href="https://other.com/baz">Baz</a>
|
|
26
|
+
<a href="#top">Top</a>
|
|
27
|
+
</body>
|
|
28
|
+
</html>
|
|
29
|
+
`;
|
|
30
|
+
const links = extractLinks(html, 'https://example.com/page/');
|
|
31
|
+
expect(links).toContain('https://example.com/foo');
|
|
32
|
+
expect(links).toContain('https://example.com/page/bar');
|
|
33
|
+
expect(links).toContain('https://other.com/baz');
|
|
34
|
+
expect(links).not.toContain('https://example.com/page/#top');
|
|
35
|
+
expect(links).toContain('https://example.com/page/');
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
test('should handle cheerio errors gracefully', () => {
|
|
39
|
+
const error = new Error('Cheerio error');
|
|
40
|
+
|
|
41
|
+
// Mock cheerio.load to throw an error
|
|
42
|
+
vi.mocked(cheerio.load).mockImplementationOnce(() => {
|
|
43
|
+
throw error;
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const links = extractLinks('<html></html>', 'https://example.com');
|
|
47
|
+
|
|
48
|
+
expect(links).toEqual([]);
|
|
49
|
+
// No console error expected
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
test('should handle non-Error exceptions gracefully', () => {
|
|
53
|
+
const error = 'String error'; // Simulate a thrown string
|
|
54
|
+
|
|
55
|
+
vi.mocked(cheerio.load).mockImplementationOnce(() => {
|
|
56
|
+
throw error;
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
const links = extractLinks('<html></html>', 'https://example.com');
|
|
60
|
+
|
|
61
|
+
expect(links).toEqual([]);
|
|
62
|
+
// No console error expected
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
test('should ignore invalid URLs that cause URL constructor to throw', () => {
|
|
66
|
+
const html = '<a href="http://[">Invalid</a>';
|
|
67
|
+
const links = extractLinks(html, 'https://example.com');
|
|
68
|
+
expect(links).toEqual([]);
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
test('should ignore non-http protocols', () => {
|
|
72
|
+
const html = `
|
|
73
|
+
<a href="mailto:test@example.com">Mail</a>
|
|
74
|
+
<a href="javascript:void(0)">JS</a>
|
|
75
|
+
<a href="ftp://example.com/file">FTP</a>
|
|
76
|
+
`;
|
|
77
|
+
const links = extractLinks(html, 'https://example.com');
|
|
78
|
+
expect(links).toEqual([]);
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
test('should ignore links without href', () => {
|
|
82
|
+
const html = '<a>No Href</a>';
|
|
83
|
+
const links = extractLinks(html, 'https://example.com');
|
|
84
|
+
expect(links).toEqual([]);
|
|
85
|
+
});
|
|
86
|
+
});
|