@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
import { test, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import { crawl } from '../src/crawler/crawl.js';
|
|
3
|
+
import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
|
|
4
|
+
import { closeDb } from '../src/db/index.js';
|
|
5
|
+
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
6
|
+
|
|
7
|
+
let mockAgent: MockAgent;
|
|
8
|
+
|
|
9
|
+
// Per-test setup: install a fresh MockAgent with real network access disabled
// as the global undici dispatcher, and set CRAWLITH_DB_PATH to ':memory:'.
beforeEach(() => {
  const agent = new MockAgent();
  agent.disableNetConnect();
  setGlobalDispatcher(agent);
  mockAgent = agent;

  process.env.CRAWLITH_DB_PATH = ':memory:';
});
|
|
15
|
+
|
|
16
|
+
// Per-test teardown: close the database via closeDb() and release the
// MockAgent that beforeEach installed as the global dispatcher.
afterEach(async () => {
  closeDb();
  // Every test creates a new MockAgent; without closing it, its mock pools
  // and clients stay alive for the rest of the test file.
  await mockAgent.close();
});
|
|
19
|
+
|
|
20
|
+
// Crawls a mocked three-page site (/ -> /page1, /page2; /page1 -> /page2;
// /page2 -> /) and checks node count, depths and link counts in the graph.
test('crawler should crawl and build graph', async () => {
  const site = mockAgent.get('https://example.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  // Root page
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><body>
<a href="/page1">Page 1</a>
<a href="/page2">Page 2</a>
</body></html>
`, htmlReply());

  // Page 1
  site.intercept({ path: '/page1', method: 'GET' }).reply(200, `
<html><body>
<a href="/page2">Page 2</a>
</body></html>
`, htmlReply());

  // Page 2
  site.intercept({ path: '/page2', method: 'GET' }).reply(200, `
<html><body>
<a href="/">Home</a>
</body></html>
`, htmlReply());

  // ignoreRobots is false in this test, so robots.txt is mocked as missing.
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, 'Not Found');

  const crawlOptions = {
    limit: 10,
    depth: 2,
    ignoreRobots: false,
    rate: 1000
  };
  const snapshotId = await crawl('https://example.com', crawlOptions);
  const graph = loadGraphFromSnapshot(snapshotId);

  expect(graph.getNodes().length).toBe(3);

  const home = graph.nodes.get('https://example.com/');
  expect(home).toBeDefined();
  expect(home?.depth).toBe(0);
  expect(home?.outLinks).toBe(2);

  const first = graph.nodes.get('https://example.com/page1');
  expect(first).toBeDefined();
  expect(first?.depth).toBe(1);
  expect(first?.inLinks).toBe(1);

  const second = graph.nodes.get('https://example.com/page2');
  expect(second).toBeDefined();
  expect(second?.inLinks).toBe(2);
});
|
|
91
|
+
|
|
92
|
+
// With limit: 2, at most two nodes (root plus one more page) may end up with
// a status greater than 0.
test('hard page limit', async () => {
  const site = mockAgent.get('https://limit.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  // Robots
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // The root page links to /1, /2 and /3.
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
`, htmlReply());

  // Each linked page answers with an empty HTML document.
  for (const path of ['/1', '/2', '/3']) {
    site.intercept({ path, method: 'GET' }).reply(200, '<html></html>', htmlReply());
  }

  const snapshotId = await crawl('https://limit.com', {
    limit: 2, // root + 1 page
    depth: 5,
    ignoreRobots: true,
    rate: 1000
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  const fetched = graph.getNodes().filter((node) => node.status > 0);
  expect(fetched.length).toBeLessThanOrEqual(2);
});
|
|
120
|
+
|
|
121
|
+
// Mocks a 12-page linear chain (/ -> /p1 -> /p2 -> ...) and requests depth 20;
// the assertion expects that no fetched node sits deeper than 10.
test('hard depth cap', async () => {
  const site = mockAgent.get('https://depth.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  // Robots
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  const CHAIN_LENGTH = 12;
  const pathAt = (i: number) => (i === 0 ? '/' : `/p${i}`);
  for (let i = 0; i < CHAIN_LENGTH; i += 1) {
    // Page i links only to page i + 1.
    site.intercept({ path: pathAt(i), method: 'GET' }).reply(200, `
<html><a href="${pathAt(i + 1)}">Next</a></html>
`, htmlReply());
  }

  const snapshotId = await crawl('https://depth.com', {
    limit: 100,
    depth: 20, // more than the depth of 10 asserted below
    ignoreRobots: true,
    rate: 1000
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  const fetchedDepths = graph
    .getNodes()
    .filter((node) => node.status > 0)
    .map((node) => node.depth);
  const deepest = Math.max(0, ...fetchedDepths);

  expect(deepest).toBeLessThanOrEqual(10);
});
|
|
149
|
+
|
|
150
|
+
// Crawls a root page that links to several query-string variants of /search
// with detectTraps enabled and stripQuery disabled, then bounds how many of
// those variants ended up fetched (status > 0).
test('parameter explosion control', async () => {
  const LINKED_VARIANTS = 10; // links emitted by the root page
  const MOCKED_VARIANTS = 40; // /search?q=N responses registered on the mock
  const MAX_FETCHED_VARIANTS = 31; // upper bound asserted below

  const site = mockAgent.get('https://params.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // Root links to many variations of the same path.
  const links = Array.from(
    { length: LINKED_VARIANTS },
    (_, i) => `<a href="/search?q=${i}">q${i}</a>`
  ).join('');
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html>${links}</html>
`, htmlReply());

  // Register a response for every variation (more than the root links to).
  for (let i = 0; i < MOCKED_VARIANTS; i += 1) {
    site.intercept({ path: `/search?q=${i}`, method: 'GET' }).reply(200, '<html></html>', htmlReply());
  }

  const snapshotId = await crawl('https://params.com', {
    limit: 100,
    depth: 5,
    ignoreRobots: true,
    stripQuery: false,
    detectTraps: true,
    rate: 1000
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  // Count fetched nodes whose URL contains /search.
  const fetchedSearchNodes = graph
    .getNodes()
    .filter((node) => node.url.includes('/search') && node.status > 0);

  // NOTE(review): the root page only links to LINKED_VARIANTS (10) variants,
  // so this bound of 31 cannot be exceeded by the current fixture. Confirm
  // whether the root was meant to emit more links than the trap threshold.
  expect(fetchedSearchNodes.length).toBeLessThanOrEqual(MAX_FETCHED_VARIANTS);
});
|
|
185
|
+
|
|
186
|
+
// Part 1: a 301 from /redir1 to /dest must leave /dest in the graph with
// status 200. Part 2: a redirect loop (/a -> /b -> /a) must not prevent the
// crawl from finishing and producing a graph.
test('redirect safety', async () => {
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });
  const crawlOptions = () => ({ limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });

  const redirectSite = mockAgent.get('https://redirect.com');
  redirectSite.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // Root -> /redir1
  redirectSite.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/redir1">Go</a></html>
`, htmlReply());

  // /redir1 -> 301 -> /dest
  redirectSite.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
    headers: { 'location': '/dest' }
  });

  // /dest -> 200
  redirectSite.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', htmlReply());

  const redirectSnapshot = await crawl('https://redirect.com', crawlOptions());
  const redirectGraph = loadGraphFromSnapshot(redirectSnapshot);

  const destination = redirectGraph.nodes.get('https://redirect.com/dest');
  expect(destination).toBeDefined();
  expect(destination?.status).toBe(200);

  // Redirect loop: A -> B -> A
  const loopSite = mockAgent.get('https://loop.com');
  loopSite.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
  loopSite.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/a">Loop</a></html>
`, htmlReply());

  // /a and /b are each mocked exactly once; the crawl is expected to stop
  // rather than follow the loop indefinitely.
  loopSite.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
  loopSite.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });

  const loopSnapshot = await crawl('https://loop.com', crawlOptions());
  const loopGraph = loadGraphFromSnapshot(loopSnapshot);

  // Reaching this point means the crawl terminated.
  expect(loopGraph.getNodes().length).toBeGreaterThan(0);
});
|
|
231
|
+
|
|
232
|
+
// A response served as application/json must be recorded in the graph, but
// links inside its body must not be added to the graph.
test('mime check', async () => {
  const site = mockAgent.get('https://mime.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/image.png">Img</a></html>
`, htmlReply());

  // /data carries HTML-looking content but is labelled as JSON.
  site.intercept({ path: '/data', method: 'GET' }).reply(200, `
<html><a href="/hidden">Hidden</a></html>
`, { headers: { 'content-type': 'application/json' } });

  // The crawl entry point (/start) links to /data.
  site.intercept({ path: '/start', method: 'GET' }).reply(200, `
<html><a href="/data">Data</a></html>
`, htmlReply());

  const snapshotId = await crawl('https://mime.com/start', {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    rate: 1000
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  // /data itself is part of the graph ...
  expect(graph.nodes.get('https://mime.com/data')).toBeDefined();
  // ... but /hidden, linked only from the JSON body, is not.
  expect(graph.nodes.get('https://mime.com/hidden')).toBeUndefined();
});
|
|
259
|
+
|
|
260
|
+
// A page linking to itself must not yield a self-edge, while its link to
// another page must still be stored as an edge.
test('self-link guard', async () => {
  const ROOT_URL = 'https://self.com/';
  const OTHER_URL = 'https://self.com/other';

  const site = mockAgent.get('https://self.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/">Self</a><a href="/other">Other</a></html>
`, { headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });

  const snapshotId = await crawl('https://self.com', {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    rate: 1000
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  const edges = graph.getEdges();
  const findEdge = (source: string, target: string) =>
    edges.find((edge) => edge.source === source && edge.target === target);

  expect(findEdge(ROOT_URL, ROOT_URL)).toBeUndefined();
  expect(findEdge(ROOT_URL, OTHER_URL)).toBeDefined();
});
|
|
280
|
+
|
|
281
|
+
// The root links to two pages but the crawl runs with limit: 2; the loaded
// graph is expected to report limitReached === true.
test('limit warning', async () => {
  const site = mockAgent.get('https://warn.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  site.intercept({ path: '/', method: 'GET' }).reply(200, `
<html><a href="/1">1</a><a href="/2">2</a></html>
`, htmlReply());

  // Only /1 is mocked; /2 has no interceptor in this test.
  site.intercept({ path: '/1', method: 'GET' }).reply(200, '', htmlReply());

  const snapshotId = await crawl('https://warn.com', {
    limit: 2,
    depth: 5,
    ignoreRobots: true,
    rate: 1000
  });
  const { limitReached } = loadGraphFromSnapshot(snapshotId);

  expect(limitReached).toBe(true);
});
|
|
296
|
+
|
|
297
|
+
// /page1 is listed only in sitemap.xml (the root page has no links); with the
// sitemap option set it must still appear in the graph with status 200.
test('seeds from sitemap', async () => {
  const site = mockAgent.get('https://sitemap-seed.com');
  const htmlReply = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // Sitemap listing a single URL.
  site.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
<urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
`);

  // Root page without any links.
  site.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', htmlReply());

  // Page referenced by the sitemap.
  site.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', htmlReply());

  const crawlOptions = {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    sitemap: 'true',
    rate: 1000
  };
  const snapshotId = await crawl('https://sitemap-seed.com', crawlOptions);
  const graph = loadGraphFromSnapshot(snapshotId);

  const seeded = graph.nodes.get('https://sitemap-seed.com/page1');
  expect(seeded).toBeDefined();
  expect(seeded?.status).toBe(200);
});
|
|
325
|
+
|
|
326
|
+
// First crawl stores the ETag of the root page. The second crawl receives the
// first graph as previousGraph; the mock only answers (with 304) when the
// request carries If-None-Match: "v1", and the node must be marked 'unchanged'.
test('incremental crawl uses etags', async () => {
  const ROOT_URL = 'https://incremental.com/';
  const site = mockAgent.get('https://incremental.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // --- First crawl: plain 200 with an ETag ---
  site.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
    headers: { 'content-type': 'text/html', 'etag': '"v1"' }
  });

  const firstSnapshot = await crawl('https://incremental.com', {
    limit: 10,
    depth: 1,
    ignoreRobots: true,
    rate: 1000
  });
  const firstGraph = loadGraphFromSnapshot(firstSnapshot);
  expect(firstGraph.nodes.get(ROOT_URL)?.etag).toBe('"v1"');

  // --- Second crawl: conditional request answered with 304 ---
  const conditionalRequest = {
    path: '/',
    method: 'GET',
    headers: { 'If-None-Match': '"v1"' }
  };
  site.intercept(conditionalRequest).reply(304, '', { headers: { 'etag': '"v1"' } });

  const secondSnapshot = await crawl('https://incremental.com', {
    limit: 10,
    depth: 1,
    ignoreRobots: true,
    previousGraph: firstGraph,
    rate: 1000
  });
  const secondGraph = loadGraphFromSnapshot(secondSnapshot);

  expect(secondGraph.nodes.get(ROOT_URL)?.incrementalStatus).toBe('unchanged');
});
|
package/tests/db.test.ts
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import Database from 'better-sqlite3';
|
|
3
|
+
import { initSchema } from '../src/db/schema.js';
|
|
4
|
+
import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
|
|
5
|
+
import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
|
|
6
|
+
import { PageRepository } from '../src/db/repositories/PageRepository.js';
|
|
7
|
+
import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
|
|
8
|
+
import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
|
|
9
|
+
|
|
10
|
+
// Repository-level tests. Every test runs against its own in-memory SQLite
// database on which initSchema() has been applied.
describe('Database Layer', () => {
  let conn: Database.Database;
  let sites: SiteRepository;
  let snapshots: SnapshotRepository;
  let pages: PageRepository;
  let edges: EdgeRepository;
  let metricsStore: MetricsRepository;

  // Creates a site for `domain` plus a 'full' snapshot for it.
  const newFullSnapshot = (domain: string) => {
    const siteId = sites.createSite(domain);
    const snapshotId = snapshots.createSnapshot(siteId, 'full');
    return { siteId, snapshotId };
  };

  beforeEach(() => {
    conn = new Database(':memory:');
    initSchema(conn);

    sites = new SiteRepository(conn);
    snapshots = new SnapshotRepository(conn);
    pages = new PageRepository(conn);
    edges = new EdgeRepository(conn);
    metricsStore = new MetricsRepository(conn);
  });

  afterEach(() => {
    conn.close();
  });

  it('should create and retrieve a site', () => {
    const domain = 'example.com';

    const createdId = sites.createSite(domain);
    expect(createdId).toBeGreaterThan(0);

    const stored = sites.getSite(domain);
    expect(stored).toBeDefined();
    expect(stored?.domain).toBe(domain);
  });

  it('should create and retrieve a snapshot', () => {
    const siteId = sites.createSite('example.com');

    const snapshotId = snapshots.createSnapshot(siteId, 'full', 'running');
    expect(snapshotId).toBeGreaterThan(0);

    const beforeUpdate = snapshots.getLatestSnapshot(siteId);
    expect(beforeUpdate).toBeDefined();
    expect(beforeUpdate?.status).toBe('running');

    // Mark the snapshot completed and attach counts in the same call.
    snapshots.updateSnapshotStatus(snapshotId, 'completed', { node_count: 10, edge_count: 5 });

    const afterUpdate = snapshots.getLatestSnapshot(siteId);
    expect(afterUpdate?.status).toBe('completed');
    expect(afterUpdate?.node_count).toBe(10);
  });

  it('should upsert pages', () => {
    const { siteId, snapshotId: firstSnapshot } = newFullSnapshot('example.com');
    const url = 'http://example.com';

    // Initial insert under the first snapshot.
    pages.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: firstSnapshot,
      http_status: 200,
      depth: 0
    });

    const inserted = pages.getPage(siteId, url);
    expect(inserted).toBeDefined();
    expect(inserted?.first_seen_snapshot_id).toBe(firstSnapshot);
    expect(inserted?.last_seen_snapshot_id).toBe(firstSnapshot);
    expect(inserted?.http_status).toBe(200);

    // Same page seen again under a second (incremental) snapshot.
    const secondSnapshot = snapshots.createSnapshot(siteId, 'incremental');
    pages.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: secondSnapshot,
      http_status: 200, // unchanged status
      depth: 0
    });

    const updated = pages.getPage(siteId, url);
    // first_seen keeps pointing at the first snapshot ...
    expect(updated?.first_seen_snapshot_id).toBe(firstSnapshot);
    // ... while last_seen moves to the second one.
    expect(updated?.last_seen_snapshot_id).toBe(secondSnapshot);
  });

  it('should persist new columns (nofollow, security_error, retries)', () => {
    const { siteId, snapshotId } = newFullSnapshot('new-cols.com');
    const url = 'http://new-cols.com';

    pages.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: snapshotId,
      nofollow: 1,
      security_error: 'blocked',
      retries: 3
    });

    const stored = pages.getPage(siteId, url);
    expect(stored?.nofollow).toBe(1);
    expect(stored?.security_error).toBe('blocked');
    expect(stored?.retries).toBe(3);
  });

  it('should insert and retrieve edges', () => {
    const { siteId, snapshotId } = newFullSnapshot('example.com');
    const sourceUrl = 'http://example.com/1';
    const targetUrl = 'http://example.com/2';

    // Both endpoint pages are created before the edge, since insertEdge
    // takes page ids.
    for (const normalized_url of [sourceUrl, targetUrl]) {
      pages.upsertPage({ site_id: siteId, normalized_url, last_seen_snapshot_id: snapshotId });
    }

    const sourcePage = pages.getPage(siteId, sourceUrl)!;
    const targetPage = pages.getPage(siteId, targetUrl)!;

    edges.insertEdge(snapshotId, sourcePage.id, targetPage.id, 1.0, 'internal');

    const stored = edges.getEdgesBySnapshot(snapshotId);
    expect(stored).toHaveLength(1);
    expect(stored[0].source_page_id).toBe(sourcePage.id);
    expect(stored[0].target_page_id).toBe(targetPage.id);
  });

  it('should insert and retrieve metrics', () => {
    const { siteId, snapshotId } = newFullSnapshot('example.com');
    const url = 'http://example.com/1';

    pages.upsertPage({ site_id: siteId, normalized_url: url, last_seen_snapshot_id: snapshotId });
    const page = pages.getPage(siteId, url)!;

    const row = {
      snapshot_id: snapshotId,
      page_id: page.id,
      authority_score: 0.5,
      hub_score: 0.2,
      pagerank: 0.8,
      pagerank_score: 80.0,
      link_role: 'authority',
      crawl_status: 'fetched',
      word_count: 100,
      thin_content_score: 0.1,
      external_link_ratio: 0.0,
      orphan_score: 0,
      duplicate_cluster_id: null,
      duplicate_type: null,
      is_cluster_primary: 0
    };
    metricsStore.insertMetrics(row);

    const stored = metricsStore.getMetricsForPage(snapshotId, page.id);
    expect(stored).toBeDefined();
    expect(stored?.authority_score).toBe(0.5);
  });
});
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { test, expect } from 'vitest';
|
|
2
|
+
import { Graph } from '../src/graph/graph.js';
|
|
3
|
+
import { compareGraphs } from '../src/diff/compare.js';
|
|
4
|
+
|
|
5
|
+
// /a exists in both graphs, /b only in the old one and /c only in the new
// one; the diff must list /c as added and /b as removed.
test('detects added and removed urls', () => {
  const SHARED = 'https://example.com/a';
  const ONLY_OLD = 'https://example.com/b';
  const ONLY_NEW = 'https://example.com/c';

  const before = new Graph();
  before.addNode(SHARED, 0, 200);
  before.addNode(ONLY_OLD, 1, 200);

  const after = new Graph();
  after.addNode(SHARED, 0, 200);
  after.addNode(ONLY_NEW, 1, 200);

  const { addedUrls, removedUrls } = compareGraphs(before, after);
  expect(addedUrls).toContain(ONLY_NEW);
  expect(removedUrls).toContain(ONLY_OLD);
});
|
|
18
|
+
|
|
19
|
+
// The same URL goes from status 200 in the old graph to 404 in the new one;
// the diff must report exactly that one change.
test('detects status changes', () => {
  const url = 'https://example.com/a';

  const before = new Graph();
  before.addNode(url, 0, 200);

  const after = new Graph();
  after.addNode(url, 0, 404);

  const { changedStatus } = compareGraphs(before, after);
  expect(changedStatus).toHaveLength(1);

  const [change] = changedStatus;
  expect(change).toEqual({
    url: 'https://example.com/a',
    oldStatus: 200,
    newStatus: 404
  });
});
|
|
34
|
+
|
|
35
|
+
// The same URL carries a different canonical in each graph; the diff must
// report a single canonical change with both old and new values.
test('detects canonical changes', () => {
  const url = 'https://example.com/a';

  // Builds a one-node graph whose node has the given canonical.
  const graphWithCanonical = (canonical: string) => {
    const graph = new Graph();
    graph.addNode(url, 0, 200);
    graph.updateNodeData(url, { canonical });
    return graph;
  };

  const before = graphWithCanonical('https://example.com/canon1');
  const after = graphWithCanonical('https://example.com/canon2');

  const { changedCanonical } = compareGraphs(before, after);
  expect(changedCanonical).toHaveLength(1);

  const [change] = changedCanonical;
  expect(change).toEqual({
    url: 'https://example.com/a',
    oldCanonical: 'https://example.com/canon1',
    newCanonical: 'https://example.com/canon2'
  });
});
|
|
52
|
+
|
|
53
|
+
// Old graph: /a alone at depth 1 with no incoming links. New graph: root
// links to /a. The test expects the orphan-count delta to be -1.
test('calculates metric deltas', () => {
  const ROOT = 'https://example.com/';
  const PAGE_A = 'https://example.com/a';

  // Before: A has depth 1 and no edges pointing at it.
  const before = new Graph();
  before.addNode(PAGE_A, 1, 200);

  // After: Root -> A.
  const after = new Graph();
  after.addNode(ROOT, 0, 200);
  after.addNode(PAGE_A, 1, 200);
  after.addEdge(ROOT, PAGE_A);

  const { metricDeltas } = compareGraphs(before, after);

  // Expected orphan counts: old 1 (A), new 0, so the delta is -1.
  expect(metricDeltas.orphanCount).toBe(-1);
});
|