@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/tests/crawler.test.ts
DELETED
|
@@ -1,364 +0,0 @@
|
|
|
1
|
-
import { test, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
2
|
-
import { crawl } from '../src/crawler/crawl.js';
|
|
3
|
-
import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
|
|
4
|
-
import { closeDb } from '../src/db/index.js';
|
|
5
|
-
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
6
|
-
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
7
|
-
import { EngineContext } from '../src/events.js';
|
|
8
|
-
|
|
9
|
-
let mockAgent: MockAgent;
|
|
10
|
-
const mockContext: EngineContext = { emit: vi.fn() };
|
|
11
|
-
|
|
12
|
-
beforeEach(() => {
|
|
13
|
-
process.env.CRAWLITH_DB_PATH = ':memory:';
|
|
14
|
-
mockAgent = new MockAgent();
|
|
15
|
-
mockAgent.disableNetConnect();
|
|
16
|
-
setGlobalDispatcher(mockAgent);
|
|
17
|
-
|
|
18
|
-
// IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
|
|
19
|
-
vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
afterEach(() => {
|
|
23
|
-
closeDb();
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
test('crawler should crawl and build graph', async () => {
|
|
27
|
-
const client = mockAgent.get('https://example.com');
|
|
28
|
-
|
|
29
|
-
// Root
|
|
30
|
-
client.intercept({
|
|
31
|
-
path: '/',
|
|
32
|
-
method: 'GET'
|
|
33
|
-
}).reply(200, `
|
|
34
|
-
<html><body>
|
|
35
|
-
<a href="/page1">Page 1</a>
|
|
36
|
-
<a href="/page2">Page 2</a>
|
|
37
|
-
</body></html>
|
|
38
|
-
`, {
|
|
39
|
-
headers: { 'content-type': 'text/html' }
|
|
40
|
-
});
|
|
41
|
-
|
|
42
|
-
// Page 1
|
|
43
|
-
client.intercept({
|
|
44
|
-
path: '/page1',
|
|
45
|
-
method: 'GET'
|
|
46
|
-
}).reply(200, `
|
|
47
|
-
<html><body>
|
|
48
|
-
<a href="/page2">Page 2</a>
|
|
49
|
-
</body></html>
|
|
50
|
-
`, {
|
|
51
|
-
headers: { 'content-type': 'text/html' }
|
|
52
|
-
});
|
|
53
|
-
|
|
54
|
-
// Page 2
|
|
55
|
-
client.intercept({
|
|
56
|
-
path: '/page2',
|
|
57
|
-
method: 'GET'
|
|
58
|
-
}).reply(200, `
|
|
59
|
-
<html><body>
|
|
60
|
-
<a href="/">Home</a>
|
|
61
|
-
</body></html>
|
|
62
|
-
`, {
|
|
63
|
-
headers: { 'content-type': 'text/html' }
|
|
64
|
-
});
|
|
65
|
-
|
|
66
|
-
// Robots.txt
|
|
67
|
-
client.intercept({
|
|
68
|
-
path: '/robots.txt',
|
|
69
|
-
method: 'GET'
|
|
70
|
-
}).reply(404, 'Not Found');
|
|
71
|
-
|
|
72
|
-
const snapshotId = await crawl('https://example.com', {
|
|
73
|
-
limit: 10,
|
|
74
|
-
depth: 2,
|
|
75
|
-
ignoreRobots: false,
|
|
76
|
-
rate: 1000
|
|
77
|
-
}, mockContext);
|
|
78
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
79
|
-
|
|
80
|
-
const nodes = graph.getNodes();
|
|
81
|
-
expect(nodes.length).toBe(3);
|
|
82
|
-
|
|
83
|
-
const root = graph.nodes.get('https://example.com/');
|
|
84
|
-
expect(root).toBeDefined();
|
|
85
|
-
expect(root?.depth).toBe(0);
|
|
86
|
-
expect(root?.outLinks).toBe(2);
|
|
87
|
-
|
|
88
|
-
const page1 = graph.nodes.get('https://example.com/page1');
|
|
89
|
-
expect(page1).toBeDefined();
|
|
90
|
-
expect(page1?.depth).toBe(1);
|
|
91
|
-
expect(page1?.inLinks).toBe(1);
|
|
92
|
-
|
|
93
|
-
const page2 = graph.nodes.get('https://example.com/page2');
|
|
94
|
-
expect(page2).toBeDefined();
|
|
95
|
-
expect(page2?.inLinks).toBe(2);
|
|
96
|
-
});
|
|
97
|
-
|
|
98
|
-
test('hard page limit', async () => {
|
|
99
|
-
const client = mockAgent.get('https://limit.com');
|
|
100
|
-
|
|
101
|
-
// Robots
|
|
102
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
103
|
-
|
|
104
|
-
// Root links to 1, 2, 3
|
|
105
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
106
|
-
<html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
|
|
107
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
108
|
-
|
|
109
|
-
// 1, 2, 3 return html
|
|
110
|
-
client.intercept({ path: '/1', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
|
|
111
|
-
client.intercept({ path: '/2', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
|
|
112
|
-
client.intercept({ path: '/3', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
|
|
113
|
-
|
|
114
|
-
const snapshotId = await crawl('https://limit.com', {
|
|
115
|
-
limit: 2, // root + 1 page
|
|
116
|
-
depth: 5,
|
|
117
|
-
ignoreRobots: true,
|
|
118
|
-
rate: 1000
|
|
119
|
-
}, mockContext);
|
|
120
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
121
|
-
|
|
122
|
-
// Should have visited root + 1 other page (total 2 nodes with status > 0)
|
|
123
|
-
const crawledNodes = graph.getNodes().filter(n => n.status > 0);
|
|
124
|
-
expect(crawledNodes.length).toBeLessThanOrEqual(2);
|
|
125
|
-
});
|
|
126
|
-
|
|
127
|
-
test('hard depth cap', async () => {
|
|
128
|
-
const client = mockAgent.get('https://depth.com');
|
|
129
|
-
|
|
130
|
-
// Robots
|
|
131
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
132
|
-
|
|
133
|
-
// Chain of 12 pages
|
|
134
|
-
for (let i = 0; i < 12; i++) {
|
|
135
|
-
const path = i === 0 ? '/' : `/p${i}`;
|
|
136
|
-
const nextPath = `/p${i + 1}`;
|
|
137
|
-
client.intercept({ path, method: 'GET' }).reply(200, `
|
|
138
|
-
<html><a href="${nextPath}">Next</a></html>
|
|
139
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
const snapshotId = await crawl('https://depth.com', {
|
|
143
|
-
limit: 100,
|
|
144
|
-
depth: 20, // requested 20, but internal hard cap is 10
|
|
145
|
-
ignoreRobots: true,
|
|
146
|
-
rate: 1000
|
|
147
|
-
}, mockContext);
|
|
148
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
149
|
-
|
|
150
|
-
const crawledNodes = graph.getNodes().filter(n => n.status > 0);
|
|
151
|
-
const maxCrawledDepth = crawledNodes.reduce((max, n) => Math.max(max, n.depth), 0);
|
|
152
|
-
|
|
153
|
-
expect(maxCrawledDepth).toBeLessThanOrEqual(10);
|
|
154
|
-
});
|
|
155
|
-
|
|
156
|
-
test('parameter explosion control', async () => {
|
|
157
|
-
const client = mockAgent.get('https://params.com');
|
|
158
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
159
|
-
|
|
160
|
-
// Root links to many variations
|
|
161
|
-
let links = '';
|
|
162
|
-
for (let i = 0; i < 10; i++) {
|
|
163
|
-
links += `<a href="/search?q=${i}">q${i}</a>`;
|
|
164
|
-
}
|
|
165
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
166
|
-
<html>${links}</html>
|
|
167
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
168
|
-
|
|
169
|
-
// Intercept all variations
|
|
170
|
-
for (let i = 0; i < 40; i++) {
|
|
171
|
-
client.intercept({ path: `/search?q=${i}`, method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
const snapshotId = await crawl('https://params.com', {
|
|
175
|
-
limit: 100,
|
|
176
|
-
depth: 5,
|
|
177
|
-
ignoreRobots: true,
|
|
178
|
-
stripQuery: false,
|
|
179
|
-
detectTraps: true,
|
|
180
|
-
rate: 1000
|
|
181
|
-
}, mockContext);
|
|
182
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
183
|
-
|
|
184
|
-
// Should only crawl 5 variations + root
|
|
185
|
-
const nodes = graph.getNodes();
|
|
186
|
-
// Filter nodes that match /search pathname
|
|
187
|
-
const searchNodes = nodes.filter(n => n.url.includes('/search') && n.status > 0);
|
|
188
|
-
|
|
189
|
-
expect(searchNodes.length).toBeLessThanOrEqual(31);
|
|
190
|
-
});
|
|
191
|
-
|
|
192
|
-
test('redirect safety', async () => {
|
|
193
|
-
const client = mockAgent.get('https://redirect.com');
|
|
194
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
195
|
-
|
|
196
|
-
// Root -> /redir1
|
|
197
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
198
|
-
<html><a href="/redir1">Go</a></html>
|
|
199
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
200
|
-
|
|
201
|
-
// /redir1 -> 301 -> /dest
|
|
202
|
-
client.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
|
|
203
|
-
headers: { 'location': '/dest' }
|
|
204
|
-
});
|
|
205
|
-
|
|
206
|
-
// /dest -> 200
|
|
207
|
-
client.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', { headers: { 'content-type': 'text/html' } });
|
|
208
|
-
|
|
209
|
-
const snapshotId = await crawl('https://redirect.com', {
|
|
210
|
-
limit: 10,
|
|
211
|
-
depth: 5,
|
|
212
|
-
ignoreRobots: true,
|
|
213
|
-
rate: 1000
|
|
214
|
-
}, mockContext);
|
|
215
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
216
|
-
|
|
217
|
-
const destNode = graph.nodes.get('https://redirect.com/dest');
|
|
218
|
-
expect(destNode).toBeDefined();
|
|
219
|
-
expect(destNode?.status).toBe(200);
|
|
220
|
-
|
|
221
|
-
// Redirect loop: A -> B -> A
|
|
222
|
-
const clientLoop = mockAgent.get('https://loop.com');
|
|
223
|
-
clientLoop.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
224
|
-
clientLoop.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
225
|
-
<html><a href="/a">Loop</a></html>
|
|
226
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
227
|
-
|
|
228
|
-
clientLoop.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
|
|
229
|
-
clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
|
|
230
|
-
// We might mock /a again if it retries, but it shouldn't infinitely loop
|
|
231
|
-
|
|
232
|
-
const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
233
|
-
const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
|
|
234
|
-
// It should eventually stop
|
|
235
|
-
expect(graphLoop.getNodes().length).toBeGreaterThan(0);
|
|
236
|
-
});
|
|
237
|
-
|
|
238
|
-
test('mime check', async () => {
|
|
239
|
-
const client = mockAgent.get('https://mime.com');
|
|
240
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
241
|
-
|
|
242
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
243
|
-
<html><a href="/image.png">Img</a></html>
|
|
244
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
245
|
-
|
|
246
|
-
client.intercept({ path: '/data', method: 'GET' }).reply(200, `
|
|
247
|
-
<html><a href="/hidden">Hidden</a></html>
|
|
248
|
-
`, { headers: { 'content-type': 'application/json' } });
|
|
249
|
-
|
|
250
|
-
// Root links to /data
|
|
251
|
-
client.intercept({ path: '/start', method: 'GET' }).reply(200, `
|
|
252
|
-
<html><a href="/data">Data</a></html>
|
|
253
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
254
|
-
|
|
255
|
-
const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
256
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
257
|
-
|
|
258
|
-
// /data should be in graph
|
|
259
|
-
const dataNode = graph.nodes.get('https://mime.com/data');
|
|
260
|
-
expect(dataNode).toBeDefined();
|
|
261
|
-
// But we should NOT have parsed it, so /hidden should NOT be in graph
|
|
262
|
-
const hiddenNode = graph.nodes.get('https://mime.com/hidden');
|
|
263
|
-
expect(hiddenNode).toBeUndefined();
|
|
264
|
-
});
|
|
265
|
-
|
|
266
|
-
test('self-link guard', async () => {
|
|
267
|
-
const client = mockAgent.get('https://self.com');
|
|
268
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
269
|
-
|
|
270
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
271
|
-
<html><a href="/">Self</a><a href="/other">Other</a></html>
|
|
272
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
273
|
-
|
|
274
|
-
client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
|
|
275
|
-
|
|
276
|
-
const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
277
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
278
|
-
|
|
279
|
-
const edges = graph.getEdges();
|
|
280
|
-
const selfEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/');
|
|
281
|
-
expect(selfEdge).toBeUndefined();
|
|
282
|
-
|
|
283
|
-
const otherEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/other');
|
|
284
|
-
expect(otherEdge).toBeDefined();
|
|
285
|
-
});
|
|
286
|
-
|
|
287
|
-
test('limit warning', async () => {
|
|
288
|
-
const client = mockAgent.get('https://warn.com');
|
|
289
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
290
|
-
|
|
291
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, `
|
|
292
|
-
<html><a href="/1">1</a><a href="/2">2</a></html>
|
|
293
|
-
`, { headers: { 'content-type': 'text/html' } });
|
|
294
|
-
|
|
295
|
-
client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
|
|
296
|
-
|
|
297
|
-
const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
298
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
299
|
-
|
|
300
|
-
expect(graph.limitReached).toBe(true);
|
|
301
|
-
});
|
|
302
|
-
|
|
303
|
-
test('seeds from sitemap', async () => {
|
|
304
|
-
const client = mockAgent.get('https://sitemap-seed.com');
|
|
305
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
306
|
-
|
|
307
|
-
// Sitemap
|
|
308
|
-
client.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
|
|
309
|
-
<urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
|
|
310
|
-
`);
|
|
311
|
-
|
|
312
|
-
// Root
|
|
313
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', { headers: { 'content-type': 'text/html' } });
|
|
314
|
-
|
|
315
|
-
// Page 1
|
|
316
|
-
client.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', { headers: { 'content-type': 'text/html' } });
|
|
317
|
-
|
|
318
|
-
const snapshotId = await crawl('https://sitemap-seed.com', {
|
|
319
|
-
limit: 10,
|
|
320
|
-
depth: 5,
|
|
321
|
-
ignoreRobots: true,
|
|
322
|
-
sitemap: 'true',
|
|
323
|
-
rate: 1000
|
|
324
|
-
}, mockContext);
|
|
325
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
326
|
-
|
|
327
|
-
const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
|
|
328
|
-
expect(page1).toBeDefined();
|
|
329
|
-
expect(page1?.status).toBe(200);
|
|
330
|
-
});
|
|
331
|
-
|
|
332
|
-
test('incremental crawl uses etags', async () => {
|
|
333
|
-
const client = mockAgent.get('https://incremental.com');
|
|
334
|
-
client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
|
|
335
|
-
|
|
336
|
-
// First crawl setup
|
|
337
|
-
client.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
|
|
338
|
-
headers: { 'content-type': 'text/html', 'etag': '"v1"' }
|
|
339
|
-
});
|
|
340
|
-
|
|
341
|
-
const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
|
|
342
|
-
const graph1 = loadGraphFromSnapshot(snapshotId1);
|
|
343
|
-
const node1 = graph1.nodes.get('https://incremental.com/');
|
|
344
|
-
expect(node1?.etag).toBe('"v1"');
|
|
345
|
-
|
|
346
|
-
// Second crawl setup
|
|
347
|
-
client.intercept({
|
|
348
|
-
path: '/',
|
|
349
|
-
method: 'GET',
|
|
350
|
-
headers: { 'If-None-Match': '"v1"' }
|
|
351
|
-
}).reply(304, '', { headers: { 'etag': '"v1"' } });
|
|
352
|
-
|
|
353
|
-
const snapshotId2 = await crawl('https://incremental.com', {
|
|
354
|
-
limit: 10,
|
|
355
|
-
depth: 1,
|
|
356
|
-
ignoreRobots: true,
|
|
357
|
-
previousGraph: graph1,
|
|
358
|
-
rate: 1000
|
|
359
|
-
}, mockContext);
|
|
360
|
-
const graph2 = loadGraphFromSnapshot(snapshotId2);
|
|
361
|
-
|
|
362
|
-
const node2 = graph2.nodes.get('https://incremental.com/');
|
|
363
|
-
expect(node2?.incrementalStatus).toBe('unchanged');
|
|
364
|
-
});
|
package/tests/db/index.test.ts
DELETED
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
2
|
-
import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
|
|
3
|
-
import fs from 'node:fs';
|
|
4
|
-
import os from 'node:os';
|
|
5
|
-
import path from 'node:path';
|
|
6
|
-
|
|
7
|
-
vi.mock('node:fs');
|
|
8
|
-
vi.mock('node:os');
|
|
9
|
-
vi.mock('better-sqlite3', () => {
|
|
10
|
-
return {
|
|
11
|
-
default: vi.fn(function () {
|
|
12
|
-
return {
|
|
13
|
-
pragma: vi.fn().mockReturnValue('ok'),
|
|
14
|
-
prepare: vi.fn().mockReturnValue({
|
|
15
|
-
run: vi.fn(),
|
|
16
|
-
get: vi.fn(),
|
|
17
|
-
iterate: vi.fn(),
|
|
18
|
-
all: vi.fn()
|
|
19
|
-
}),
|
|
20
|
-
exec: vi.fn(),
|
|
21
|
-
close: vi.fn(),
|
|
22
|
-
transaction: vi.fn((fn) => fn),
|
|
23
|
-
};
|
|
24
|
-
}),
|
|
25
|
-
};
|
|
26
|
-
});
|
|
27
|
-
vi.mock('../../src/db/schema.js', () => ({
|
|
28
|
-
initSchema: vi.fn(),
|
|
29
|
-
}));
|
|
30
|
-
|
|
31
|
-
describe('DB Index', () => {
|
|
32
|
-
const originalEnv = process.env;
|
|
33
|
-
|
|
34
|
-
beforeEach(() => {
|
|
35
|
-
vi.resetAllMocks();
|
|
36
|
-
closeDb();
|
|
37
|
-
process.env = { ...originalEnv };
|
|
38
|
-
// Default mock behaviors
|
|
39
|
-
vi.mocked(os.homedir).mockReturnValue('/home/user');
|
|
40
|
-
vi.mocked(fs.existsSync).mockReturnValue(false);
|
|
41
|
-
vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
|
|
42
|
-
vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
afterEach(() => {
|
|
46
|
-
process.env = originalEnv;
|
|
47
|
-
closeDb();
|
|
48
|
-
});
|
|
49
|
-
|
|
50
|
-
describe('getDbPath', () => {
|
|
51
|
-
it('should return :memory: in test environment', () => {
|
|
52
|
-
process.env.NODE_ENV = 'test';
|
|
53
|
-
expect(getDbPath()).toBe(':memory:');
|
|
54
|
-
});
|
|
55
|
-
|
|
56
|
-
it('should return custom path if CRAWLITH_DB_PATH is set', () => {
|
|
57
|
-
process.env.NODE_ENV = 'production';
|
|
58
|
-
process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
|
|
59
|
-
expect(getDbPath()).toBe('/custom/path/db.sqlite');
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
it('should return default path in home dir if no env var', () => {
|
|
63
|
-
process.env.NODE_ENV = 'production';
|
|
64
|
-
delete process.env.CRAWLITH_DB_PATH;
|
|
65
|
-
|
|
66
|
-
const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
|
|
67
|
-
expect(getDbPath()).toBe(expectedPath);
|
|
68
|
-
|
|
69
|
-
expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
|
|
70
|
-
expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
|
|
71
|
-
});
|
|
72
|
-
|
|
73
|
-
it('should not create dir if it exists', () => {
|
|
74
|
-
process.env.NODE_ENV = 'production';
|
|
75
|
-
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
76
|
-
|
|
77
|
-
getDbPath();
|
|
78
|
-
|
|
79
|
-
expect(fs.mkdirSync).not.toHaveBeenCalled();
|
|
80
|
-
});
|
|
81
|
-
});
|
|
82
|
-
|
|
83
|
-
describe('getDb', () => {
|
|
84
|
-
it('should create a new database instance', () => {
|
|
85
|
-
process.env.NODE_ENV = 'production';
|
|
86
|
-
const db = getDb();
|
|
87
|
-
expect(db).toBeDefined();
|
|
88
|
-
// Check if pragma was called
|
|
89
|
-
expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
|
|
90
|
-
});
|
|
91
|
-
|
|
92
|
-
it('should return existing instance if called twice', () => {
|
|
93
|
-
process.env.NODE_ENV = 'production';
|
|
94
|
-
const db1 = getDb();
|
|
95
|
-
const db2 = getDb();
|
|
96
|
-
expect(db1).toBe(db2);
|
|
97
|
-
});
|
|
98
|
-
|
|
99
|
-
it('should handle permission errors gracefully', () => {
|
|
100
|
-
process.env.NODE_ENV = 'production';
|
|
101
|
-
// Avoid getDbPath throwing
|
|
102
|
-
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
103
|
-
|
|
104
|
-
vi.mocked(fs.chmodSync).mockImplementation((path) => {
|
|
105
|
-
if (path.toString().endsWith('crawlith.db')) {
|
|
106
|
-
throw new Error('EPERM');
|
|
107
|
-
}
|
|
108
|
-
});
|
|
109
|
-
|
|
110
|
-
expect(() => getDb()).not.toThrow();
|
|
111
|
-
});
|
|
112
|
-
|
|
113
|
-
it('should warn if integrity check fails', async () => {
|
|
114
|
-
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
|
|
115
|
-
process.env.NODE_ENV = 'production';
|
|
116
|
-
vi.mocked(fs.existsSync).mockReturnValue(true);
|
|
117
|
-
|
|
118
|
-
const MockDatabase = (await import('better-sqlite3')).default;
|
|
119
|
-
vi.mocked(MockDatabase).mockImplementationOnce(function() {
|
|
120
|
-
return {
|
|
121
|
-
pragma: vi.fn().mockReturnValue('corrupt'),
|
|
122
|
-
prepare: vi.fn(),
|
|
123
|
-
exec: vi.fn(),
|
|
124
|
-
close: vi.fn(),
|
|
125
|
-
transaction: vi.fn(),
|
|
126
|
-
} as any;
|
|
127
|
-
});
|
|
128
|
-
|
|
129
|
-
getDb();
|
|
130
|
-
|
|
131
|
-
expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
|
|
132
|
-
});
|
|
133
|
-
});
|
|
134
|
-
});
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
-
import Database from 'better-sqlite3';
|
|
3
|
-
import { PageRepository } from '../../src/db/repositories/PageRepository.js';
|
|
4
|
-
import { initSchema } from '../../src/db/schema.js';
|
|
5
|
-
|
|
6
|
-
describe('PageRepository', () => {
|
|
7
|
-
let db: Database.Database;
|
|
8
|
-
let repo: PageRepository;
|
|
9
|
-
|
|
10
|
-
beforeEach(() => {
|
|
11
|
-
db = new Database(':memory:');
|
|
12
|
-
initSchema(db);
|
|
13
|
-
repo = new PageRepository(db);
|
|
14
|
-
|
|
15
|
-
// Seed required tables (sites, snapshots)
|
|
16
|
-
db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
|
|
17
|
-
db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
|
|
18
|
-
});
|
|
19
|
-
|
|
20
|
-
afterEach(() => {
|
|
21
|
-
db.close();
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
it('should get pages by URLs in chunks', () => {
|
|
25
|
-
const urls: string[] = [];
|
|
26
|
-
const siteId = 1;
|
|
27
|
-
const snapshotId = 1;
|
|
28
|
-
|
|
29
|
-
// Create 1000 pages (chunk size is 900)
|
|
30
|
-
const insertStmt = db.prepare(`
|
|
31
|
-
INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
|
|
32
|
-
VALUES (?, ?, ?, ?)
|
|
33
|
-
`);
|
|
34
|
-
|
|
35
|
-
const tx = db.transaction(() => {
|
|
36
|
-
for (let i = 0; i < 1000; i++) {
|
|
37
|
-
const url = `http://example.com/page${i}`;
|
|
38
|
-
urls.push(url);
|
|
39
|
-
insertStmt.run(siteId, url, snapshotId, snapshotId);
|
|
40
|
-
}
|
|
41
|
-
});
|
|
42
|
-
tx();
|
|
43
|
-
|
|
44
|
-
// Fetch pages
|
|
45
|
-
const pages = repo.getPagesByUrls(siteId, urls);
|
|
46
|
-
|
|
47
|
-
expect(pages).toHaveLength(1000);
|
|
48
|
-
expect(pages[0].normalized_url).toBe('http://example.com/page0');
|
|
49
|
-
expect(pages[999].normalized_url).toBe('http://example.com/page999');
|
|
50
|
-
});
|
|
51
|
-
|
|
52
|
-
it('should return empty array for empty URL list', () => {
|
|
53
|
-
const pages = repo.getPagesByUrls(1, []);
|
|
54
|
-
expect(pages).toEqual([]);
|
|
55
|
-
});
|
|
56
|
-
|
|
57
|
-
it('should iterate over pages by snapshot', () => {
|
|
58
|
-
const siteId = 1;
|
|
59
|
-
const snapshotId = 1;
|
|
60
|
-
const insertStmt = db.prepare(`
|
|
61
|
-
INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
|
|
62
|
-
VALUES (?, ?, ?, ?)
|
|
63
|
-
`);
|
|
64
|
-
|
|
65
|
-
db.transaction(() => {
|
|
66
|
-
insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
|
|
67
|
-
insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
|
|
68
|
-
insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
|
|
69
|
-
})();
|
|
70
|
-
|
|
71
|
-
const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
|
|
72
|
-
const pages = Array.from(iterator);
|
|
73
|
-
|
|
74
|
-
expect(pages).toHaveLength(3);
|
|
75
|
-
expect(pages.map(p => p.normalized_url).sort()).toEqual([
|
|
76
|
-
'http://example.com/1',
|
|
77
|
-
'http://example.com/2',
|
|
78
|
-
'http://example.com/3'
|
|
79
|
-
]);
|
|
80
|
-
});
|
|
81
|
-
|
|
82
|
-
it('should upsert and get ID', () => {
|
|
83
|
-
const pageData = {
|
|
84
|
-
site_id: 1,
|
|
85
|
-
normalized_url: 'http://example.com/new',
|
|
86
|
-
last_seen_snapshot_id: 1,
|
|
87
|
-
http_status: 200,
|
|
88
|
-
};
|
|
89
|
-
|
|
90
|
-
const id = repo.upsertAndGetId(pageData);
|
|
91
|
-
expect(id).toBeGreaterThan(0);
|
|
92
|
-
|
|
93
|
-
const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
|
|
94
|
-
expect(sameId).toBe(id);
|
|
95
|
-
|
|
96
|
-
const page = repo.getPage(1, 'http://example.com/new');
|
|
97
|
-
expect(page?.http_status).toBe(404);
|
|
98
|
-
});
|
|
99
|
-
|
|
100
|
-
it('should get ID by URL', () => {
|
|
101
|
-
const pageData = {
|
|
102
|
-
site_id: 1,
|
|
103
|
-
normalized_url: 'http://example.com/id-test',
|
|
104
|
-
last_seen_snapshot_id: 1,
|
|
105
|
-
};
|
|
106
|
-
repo.upsertPage(pageData);
|
|
107
|
-
|
|
108
|
-
const id = repo.getIdByUrl(1, 'http://example.com/id-test');
|
|
109
|
-
expect(id).toBeDefined();
|
|
110
|
-
expect(id).toBeGreaterThan(0);
|
|
111
|
-
|
|
112
|
-
const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
|
|
113
|
-
expect(missingId).toBeUndefined();
|
|
114
|
-
});
|
|
115
|
-
});
|