@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/tests/pagerank.test.ts
DELETED
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { Graph } from '../src/graph/graph.js';
|
|
3
|
-
import { computePageRank } from '../src/graph/pagerank.js';
|
|
4
|
-
|
|
5
|
-
describe('PageRank Engine', () => {
|
|
6
|
-
it('should calculate identical PageRank for a simple loop', () => {
|
|
7
|
-
const graph = new Graph();
|
|
8
|
-
graph.addNode('https://a.com', 0, 200);
|
|
9
|
-
graph.addNode('https://b.com', 1, 200);
|
|
10
|
-
graph.addEdge('https://a.com', 'https://b.com');
|
|
11
|
-
graph.addEdge('https://b.com', 'https://a.com');
|
|
12
|
-
|
|
13
|
-
computePageRank(graph);
|
|
14
|
-
const nodes = graph.getNodes();
|
|
15
|
-
|
|
16
|
-
expect(nodes[0].pageRank).toBeCloseTo(0.5, 4);
|
|
17
|
-
expect(nodes[1].pageRank).toBeCloseTo(0.5, 4);
|
|
18
|
-
expect(nodes[0].pageRankScore).toBe(100);
|
|
19
|
-
expect(nodes[1].pageRankScore).toBe(100);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
it('should identify the center of a star graph as most important', () => {
|
|
23
|
-
const graph = new Graph();
|
|
24
|
-
graph.addNode('https://center.com', 0, 200);
|
|
25
|
-
graph.addNode('https://p1.com', 1, 200);
|
|
26
|
-
graph.addNode('https://p2.com', 1, 200);
|
|
27
|
-
graph.addNode('https://p3.com', 1, 200);
|
|
28
|
-
|
|
29
|
-
// Star in: all link to center
|
|
30
|
-
graph.addEdge('https://p1.com', 'https://center.com');
|
|
31
|
-
graph.addEdge('https://p2.com', 'https://center.com');
|
|
32
|
-
graph.addEdge('https://p3.com', 'https://center.com');
|
|
33
|
-
|
|
34
|
-
computePageRank(graph);
|
|
35
|
-
const nodes = graph.getNodes();
|
|
36
|
-
|
|
37
|
-
const center = nodes.find(n => n.url.includes('center'))!;
|
|
38
|
-
const leaves = nodes.filter(n => !n.url.includes('center'));
|
|
39
|
-
|
|
40
|
-
expect(center.pageRankScore).toBe(100);
|
|
41
|
-
leaves.forEach(leaf => {
|
|
42
|
-
expect(leaf.pageRankScore).toBeLessThan(100);
|
|
43
|
-
expect(leaf.pageRank!).toBeLessThan(center.pageRank!);
|
|
44
|
-
});
|
|
45
|
-
});
|
|
46
|
-
|
|
47
|
-
it('should respect link weights (Body > Nav > Footer)', () => {
|
|
48
|
-
const graph = new Graph();
|
|
49
|
-
graph.addNode('https://source.com', 0, 200);
|
|
50
|
-
graph.addNode('https://body-target.com', 1, 200);
|
|
51
|
-
graph.addNode('https://footer-target.com', 1, 200);
|
|
52
|
-
|
|
53
|
-
// Body weight 1.0, Footer weight 0.4
|
|
54
|
-
graph.addEdge('https://source.com', 'https://body-target.com', 1.0);
|
|
55
|
-
graph.addEdge('https://source.com', 'https://footer-target.com', 0.4);
|
|
56
|
-
|
|
57
|
-
computePageRank(graph);
|
|
58
|
-
|
|
59
|
-
const bodyTarget = graph.nodes.get('https://body-target.com')!;
|
|
60
|
-
const footerTarget = graph.nodes.get('https://footer-target.com')!;
|
|
61
|
-
|
|
62
|
-
expect(bodyTarget.pageRank!).toBeGreaterThan(footerTarget.pageRank!);
|
|
63
|
-
});
|
|
64
|
-
|
|
65
|
-
it('should handle sink nodes by redistributing rank', () => {
|
|
66
|
-
const graph = new Graph();
|
|
67
|
-
graph.addNode('https://a.com', 0, 200);
|
|
68
|
-
graph.addNode('https://b.com', 1, 200); // b is a sink
|
|
69
|
-
graph.addEdge('https://a.com', 'https://b.com');
|
|
70
|
-
|
|
71
|
-
computePageRank(graph);
|
|
72
|
-
|
|
73
|
-
const nodeA = graph.nodes.get('https://a.com')!;
|
|
74
|
-
const nodeB = graph.nodes.get('https://b.com')!;
|
|
75
|
-
|
|
76
|
-
// Without redistribution, A would lose all rank.
|
|
77
|
-
// With redistribution, A should still have some rank.
|
|
78
|
-
expect(nodeA.pageRank).toBeGreaterThan(0);
|
|
79
|
-
expect(nodeB.pageRank).toBeGreaterThan(nodeA.pageRank!);
|
|
80
|
-
});
|
|
81
|
-
|
|
82
|
-
it('should exclude noindex pages from receiving or passing rank', () => {
|
|
83
|
-
const graph = new Graph();
|
|
84
|
-
graph.addNode('https://a.com', 0, 200);
|
|
85
|
-
graph.addNode('https://no-index.com', 1, 200);
|
|
86
|
-
graph.nodes.get('https://no-index.com')!.noindex = true;
|
|
87
|
-
|
|
88
|
-
graph.addEdge('https://a.com', 'https://no-index.com');
|
|
89
|
-
|
|
90
|
-
computePageRank(graph);
|
|
91
|
-
|
|
92
|
-
const nodeA = graph.nodes.get('https://a.com')!;
|
|
93
|
-
const nodeNoIndex = graph.nodes.get('https://no-index.com')!;
|
|
94
|
-
|
|
95
|
-
expect(nodeNoIndex.pageRank).toBeUndefined();
|
|
96
|
-
expect(nodeA.pageRank).toBe(1.0); // Only one eligible node
|
|
97
|
-
});
|
|
98
|
-
});
|
package/tests/parser.test.ts
DELETED
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
import { test, expect } from 'vitest';
|
|
2
|
-
import { Parser } from '../src/crawler/parser.js';
|
|
3
|
-
|
|
4
|
-
const parser = new Parser();
|
|
5
|
-
const baseUrl = 'https://example.com';
|
|
6
|
-
|
|
7
|
-
test('extracts links correctly', () => {
|
|
8
|
-
const html = `
|
|
9
|
-
<html>
|
|
10
|
-
<body>
|
|
11
|
-
<a href="/page1">Page 1</a>
|
|
12
|
-
<a href="https://other.com">Other</a>
|
|
13
|
-
<a href="#hash">Hash</a>
|
|
14
|
-
<a href="javascript:void(0)">JS</a>
|
|
15
|
-
</body>
|
|
16
|
-
</html>
|
|
17
|
-
`;
|
|
18
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
19
|
-
const urls = result.links.map(l => l.url);
|
|
20
|
-
expect(urls).toContain('https://example.com/page1');
|
|
21
|
-
expect(urls).toContain('https://other.com/');
|
|
22
|
-
expect(urls).not.toContain('https://example.com/#hash');
|
|
23
|
-
// It also extracts the base URL itself from href="#hash"
|
|
24
|
-
expect(urls).toContain('https://example.com/');
|
|
25
|
-
expect(result.links.length).toBe(3);
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
test('respects nofollow on links', () => {
|
|
29
|
-
const html = `
|
|
30
|
-
<html>
|
|
31
|
-
<body>
|
|
32
|
-
<a href="/page1" rel="nofollow">Page 1</a>
|
|
33
|
-
<a href="/page2">Page 2</a>
|
|
34
|
-
</body>
|
|
35
|
-
</html>
|
|
36
|
-
`;
|
|
37
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
38
|
-
const urls = result.links.map(l => l.url);
|
|
39
|
-
expect(urls).not.toContain('https://example.com/page1');
|
|
40
|
-
expect(urls).toContain('https://example.com/page2');
|
|
41
|
-
});
|
|
42
|
-
|
|
43
|
-
test('respects meta robots nofollow', () => {
|
|
44
|
-
const html = `
|
|
45
|
-
<html>
|
|
46
|
-
<head>
|
|
47
|
-
<meta name="robots" content="nofollow">
|
|
48
|
-
</head>
|
|
49
|
-
<body>
|
|
50
|
-
<a href="/page1">Page 1</a>
|
|
51
|
-
</body>
|
|
52
|
-
</html>
|
|
53
|
-
`;
|
|
54
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
55
|
-
expect(result.nofollow).toBe(true);
|
|
56
|
-
expect(result.links.length).toBe(0);
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
test('detects canonical', () => {
|
|
60
|
-
const html = `
|
|
61
|
-
<html>
|
|
62
|
-
<head>
|
|
63
|
-
<link rel="canonical" href="https://example.com/canon">
|
|
64
|
-
</head>
|
|
65
|
-
</html>
|
|
66
|
-
`;
|
|
67
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
68
|
-
expect(result.canonical).toBe('https://example.com/canon');
|
|
69
|
-
});
|
|
70
|
-
|
|
71
|
-
test('detects relative canonical', () => {
|
|
72
|
-
const html = `
|
|
73
|
-
<html>
|
|
74
|
-
<head>
|
|
75
|
-
<link rel="canonical" href="/canon">
|
|
76
|
-
</head>
|
|
77
|
-
</html>
|
|
78
|
-
`;
|
|
79
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
80
|
-
expect(result.canonical).toBe('https://example.com/canon');
|
|
81
|
-
});
|
|
82
|
-
|
|
83
|
-
test('detects soft 404', () => {
|
|
84
|
-
const html = `
|
|
85
|
-
<html>
|
|
86
|
-
<head><title>Page Not Found</title></head>
|
|
87
|
-
<body>Sorry, the page you are looking for does not exist.</body>
|
|
88
|
-
</html>
|
|
89
|
-
`;
|
|
90
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
91
|
-
expect(result.soft404Score).toBeGreaterThanOrEqual(0.5);
|
|
92
|
-
});
|
|
93
|
-
|
|
94
|
-
test('content hash ignores scripts', () => {
|
|
95
|
-
const html1 = `
|
|
96
|
-
<html><body><script>var x=1;</script><p>Hello</p></body></html>
|
|
97
|
-
`;
|
|
98
|
-
const html2 = `
|
|
99
|
-
<html><body><script>var x=2;</script><p>Hello</p></body></html>
|
|
100
|
-
`;
|
|
101
|
-
const result1 = parser.parse(html1, baseUrl, 200);
|
|
102
|
-
const result2 = parser.parse(html2, baseUrl, 200);
|
|
103
|
-
expect(result1.contentHash).toBe(result2.contentHash);
|
|
104
|
-
});
|
|
105
|
-
|
|
106
|
-
test('detects meta robots noindex', () => {
|
|
107
|
-
const html = `
|
|
108
|
-
<html>
|
|
109
|
-
<head>
|
|
110
|
-
<meta name="robots" content="noindex, nofollow">
|
|
111
|
-
</head>
|
|
112
|
-
</html>
|
|
113
|
-
`;
|
|
114
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
115
|
-
expect(result.noindex).toBe(true);
|
|
116
|
-
expect(result.nofollow).toBe(true);
|
|
117
|
-
});
|
package/tests/proxy_safety.test.ts
DELETED
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
-
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
|
-
import { request, ProxyAgent } from 'undici';
|
|
4
|
-
|
|
5
|
-
vi.mock('undici', async (importOriginal) => {
|
|
6
|
-
const original = await importOriginal<typeof import('undici')>();
|
|
7
|
-
return {
|
|
8
|
-
...original,
|
|
9
|
-
request: vi.fn(),
|
|
10
|
-
ProxyAgent: vi.fn(function () {
|
|
11
|
-
return {
|
|
12
|
-
request: vi.fn(),
|
|
13
|
-
close: vi.fn()
|
|
14
|
-
};
|
|
15
|
-
})
|
|
16
|
-
};
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
describe('Proxy Integration', () => {
|
|
20
|
-
beforeEach(() => {
|
|
21
|
-
vi.clearAllMocks();
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
it('should use ProxyAgent when proxyUrl is provided', async () => {
|
|
25
|
-
const fetcher = new Fetcher({ proxyUrl: 'http://proxy.com:8080', rate: 100 });
|
|
26
|
-
const mockRequest = vi.mocked(request);
|
|
27
|
-
|
|
28
|
-
// Mock the request to return a successful response immediately
|
|
29
|
-
mockRequest.mockResolvedValueOnce({
|
|
30
|
-
statusCode: 200,
|
|
31
|
-
headers: {},
|
|
32
|
-
body: {
|
|
33
|
-
on: vi.fn((event, cb) => {
|
|
34
|
-
if (event === 'data') {
|
|
35
|
-
// Simulate async data chunk
|
|
36
|
-
setTimeout(() => cb(Buffer.from('ok')), 0);
|
|
37
|
-
}
|
|
38
|
-
if (event === 'end') {
|
|
39
|
-
// Simulate async end
|
|
40
|
-
setTimeout(() => cb(), 0);
|
|
41
|
-
}
|
|
42
|
-
return { on: vi.fn() }; // chaining
|
|
43
|
-
}),
|
|
44
|
-
dump: vi.fn(),
|
|
45
|
-
text: vi.fn().mockResolvedValue('ok')
|
|
46
|
-
}
|
|
47
|
-
} as any);
|
|
48
|
-
|
|
49
|
-
await fetcher.fetch('http://target.com');
|
|
50
|
-
|
|
51
|
-
expect(ProxyAgent).toHaveBeenCalledWith('http://proxy.com:8080');
|
|
52
|
-
});
|
|
53
|
-
|
|
54
|
-
it('should fail fast on invalid proxy URL', () => {
|
|
55
|
-
expect(() => new Fetcher({ proxyUrl: 'not-a-url' })).toThrow('Invalid proxy URL');
|
|
56
|
-
});
|
|
57
|
-
});
|
package/tests/redirect_safety.test.ts
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
-
import { RedirectController } from '../src/core/network/redirectController.js';
|
|
3
|
-
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
4
|
-
import { request } from 'undici';
|
|
5
|
-
|
|
6
|
-
vi.mock('undici', () => ({
|
|
7
|
-
request: vi.fn(),
|
|
8
|
-
ProxyAgent: vi.fn().mockImplementation(() => ({ dispatcher: {} })),
|
|
9
|
-
Agent: class {
|
|
10
|
-
dispatch = vi.fn();
|
|
11
|
-
},
|
|
12
|
-
Dispatcher: class {}
|
|
13
|
-
}));
|
|
14
|
-
|
|
15
|
-
describe('RedirectController', () => {
|
|
16
|
-
it('should limit hops', () => {
|
|
17
|
-
const ctrl = new RedirectController(2);
|
|
18
|
-
expect(ctrl.nextHop('http://b.com')).toBe(null);
|
|
19
|
-
expect(ctrl.nextHop('http://c.com')).toBe(null);
|
|
20
|
-
expect(ctrl.nextHop('http://d.com')).toBe('redirect_limit_exceeded');
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
it('should detect loops', () => {
|
|
24
|
-
const ctrl = new RedirectController(5);
|
|
25
|
-
expect(ctrl.nextHop('http://b.com')).toBe(null);
|
|
26
|
-
expect(ctrl.nextHop('http://a.com')).toBe(null);
|
|
27
|
-
expect(ctrl.nextHop('http://b.com')).toBe('redirect_loop');
|
|
28
|
-
});
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
describe('Fetcher Redirect Integration', () => {
|
|
32
|
-
let fetcher: Fetcher;
|
|
33
|
-
|
|
34
|
-
beforeEach(() => {
|
|
35
|
-
vi.clearAllMocks();
|
|
36
|
-
fetcher = new Fetcher({ rate: 100, maxRedirects: 2 });
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
it('should stop at max redirects', async () => {
|
|
40
|
-
const mockRequest = vi.mocked(request);
|
|
41
|
-
|
|
42
|
-
// Return 301 with unique locations
|
|
43
|
-
mockRequest
|
|
44
|
-
.mockResolvedValueOnce({
|
|
45
|
-
statusCode: 301,
|
|
46
|
-
headers: { location: 'http://a.com' },
|
|
47
|
-
body: { dump: vi.fn().mockResolvedValue(undefined) }
|
|
48
|
-
} as any)
|
|
49
|
-
.mockResolvedValueOnce({
|
|
50
|
-
statusCode: 301,
|
|
51
|
-
headers: { location: 'http://b.com' },
|
|
52
|
-
body: { dump: vi.fn().mockResolvedValue(undefined) }
|
|
53
|
-
} as any)
|
|
54
|
-
.mockResolvedValueOnce({
|
|
55
|
-
statusCode: 301,
|
|
56
|
-
headers: { location: 'http://c.com' },
|
|
57
|
-
body: { dump: vi.fn().mockResolvedValue(undefined) }
|
|
58
|
-
} as any);
|
|
59
|
-
|
|
60
|
-
const res = await fetcher.fetch('http://start.com');
|
|
61
|
-
expect(res.status).toBe('redirect_limit_exceeded');
|
|
62
|
-
expect(res.redirectChain).toHaveLength(2);
|
|
63
|
-
});
|
|
64
|
-
|
|
65
|
-
it('should detect loops in fetch', async () => {
|
|
66
|
-
const mockRequest = vi.mocked(request);
|
|
67
|
-
|
|
68
|
-
mockRequest.mockResolvedValue({
|
|
69
|
-
statusCode: 301,
|
|
70
|
-
headers: { location: 'http://start.com' },
|
|
71
|
-
body: { dump: vi.fn().mockResolvedValue(undefined) }
|
|
72
|
-
} as any);
|
|
73
|
-
|
|
74
|
-
const res = await fetcher.fetch('http://start.com');
|
|
75
|
-
expect(res.status).toBe('redirect_loop');
|
|
76
|
-
});
|
|
77
|
-
});
|
package/tests/renderAnalysisCsv.test.ts
DELETED
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import { renderAnalysisCsv, AnalysisResult } from '../src/analysis/analyze.js';
|
|
3
|
-
|
|
4
|
-
describe('renderAnalysisCsv', () => {
|
|
5
|
-
test('renders CSV with headers', () => {
|
|
6
|
-
const result: AnalysisResult = {
|
|
7
|
-
pages: [],
|
|
8
|
-
site_summary: {
|
|
9
|
-
pages_analyzed: 0,
|
|
10
|
-
avg_seo_score: 0,
|
|
11
|
-
thin_pages: 0,
|
|
12
|
-
duplicate_titles: 0,
|
|
13
|
-
site_score: 0
|
|
14
|
-
},
|
|
15
|
-
site_scores: {} as any,
|
|
16
|
-
active_modules: {
|
|
17
|
-
seo: true,
|
|
18
|
-
content: true,
|
|
19
|
-
accessibility: true
|
|
20
|
-
}
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
const csv = renderAnalysisCsv(result);
|
|
24
|
-
expect(csv).toContain('URL,SEO Score,Thin Score,HTTP Status,Title,Title Length,Meta Description,Desc Length,Word Count,Internal Links,External Links');
|
|
25
|
-
});
|
|
26
|
-
|
|
27
|
-
test('renders a single page correctly', () => {
|
|
28
|
-
const result: AnalysisResult = {
|
|
29
|
-
pages: [
|
|
30
|
-
{
|
|
31
|
-
url: 'https://example.com',
|
|
32
|
-
status: 200,
|
|
33
|
-
seoScore: 85,
|
|
34
|
-
thinScore: 10,
|
|
35
|
-
title: { value: 'Example Domain', length: 14, status: 'ok' },
|
|
36
|
-
metaDescription: { value: 'This is an example description.', length: 29, status: 'ok' },
|
|
37
|
-
content: { wordCount: 500 } as any,
|
|
38
|
-
links: { internalLinks: 5, externalLinks: 2 } as any,
|
|
39
|
-
h1: {} as any,
|
|
40
|
-
images: {} as any,
|
|
41
|
-
structuredData: {} as any,
|
|
42
|
-
meta: {}
|
|
43
|
-
}
|
|
44
|
-
],
|
|
45
|
-
site_summary: {
|
|
46
|
-
pages_analyzed: 1,
|
|
47
|
-
avg_seo_score: 85,
|
|
48
|
-
thin_pages: 0,
|
|
49
|
-
duplicate_titles: 0,
|
|
50
|
-
site_score: 85
|
|
51
|
-
},
|
|
52
|
-
site_scores: {} as any,
|
|
53
|
-
active_modules: {
|
|
54
|
-
seo: true,
|
|
55
|
-
content: true,
|
|
56
|
-
accessibility: true
|
|
57
|
-
}
|
|
58
|
-
};
|
|
59
|
-
|
|
60
|
-
const csv = renderAnalysisCsv(result);
|
|
61
|
-
const lines = csv.split('\n');
|
|
62
|
-
expect(lines.length).toBe(2);
|
|
63
|
-
expect(lines[1]).toContain('https://example.com,85,10,200,"Example Domain",14,"This is an example description.",29,500,5,2');
|
|
64
|
-
});
|
|
65
|
-
|
|
66
|
-
test('escapes quotes in title and meta description', () => {
|
|
67
|
-
const result: AnalysisResult = {
|
|
68
|
-
pages: [
|
|
69
|
-
{
|
|
70
|
-
url: 'https://example.com/quote',
|
|
71
|
-
status: 200,
|
|
72
|
-
seoScore: 90,
|
|
73
|
-
thinScore: 5,
|
|
74
|
-
title: { value: 'Example "Quoted" Domain', length: 23, status: 'ok' },
|
|
75
|
-
metaDescription: { value: 'This description contains "quotes" inside.', length: 42, status: 'ok' },
|
|
76
|
-
content: { wordCount: 300 } as any,
|
|
77
|
-
links: { internalLinks: 3, externalLinks: 1 } as any,
|
|
78
|
-
h1: {} as any,
|
|
79
|
-
images: {} as any,
|
|
80
|
-
structuredData: {} as any,
|
|
81
|
-
meta: {}
|
|
82
|
-
}
|
|
83
|
-
],
|
|
84
|
-
site_summary: {
|
|
85
|
-
pages_analyzed: 1,
|
|
86
|
-
avg_seo_score: 90,
|
|
87
|
-
thin_pages: 0,
|
|
88
|
-
duplicate_titles: 0,
|
|
89
|
-
site_score: 90
|
|
90
|
-
},
|
|
91
|
-
site_scores: {} as any,
|
|
92
|
-
active_modules: {
|
|
93
|
-
seo: true,
|
|
94
|
-
content: true,
|
|
95
|
-
accessibility: true
|
|
96
|
-
}
|
|
97
|
-
};
|
|
98
|
-
|
|
99
|
-
const csv = renderAnalysisCsv(result);
|
|
100
|
-
const lines = csv.split('\n');
|
|
101
|
-
// Expect double quotes to be escaped with double quotes: " -> ""
|
|
102
|
-
// And the whole field wrapped in quotes
|
|
103
|
-
expect(lines[1]).toContain('"Example ""Quoted"" Domain"');
|
|
104
|
-
expect(lines[1]).toContain('"This description contains ""quotes"" inside."');
|
|
105
|
-
});
|
|
106
|
-
|
|
107
|
-
test('handles Pending/Limit status (status: 0)', () => {
|
|
108
|
-
const result: AnalysisResult = {
|
|
109
|
-
pages: [
|
|
110
|
-
{
|
|
111
|
-
url: 'https://example.com/pending',
|
|
112
|
-
status: 0,
|
|
113
|
-
seoScore: 0,
|
|
114
|
-
thinScore: 0,
|
|
115
|
-
title: { value: null, length: 0, status: 'missing' },
|
|
116
|
-
metaDescription: { value: null, length: 0, status: 'missing' },
|
|
117
|
-
content: { wordCount: 0 } as any,
|
|
118
|
-
links: { internalLinks: 0, externalLinks: 0 } as any,
|
|
119
|
-
h1: {} as any,
|
|
120
|
-
images: {} as any,
|
|
121
|
-
structuredData: {} as any,
|
|
122
|
-
meta: {}
|
|
123
|
-
}
|
|
124
|
-
],
|
|
125
|
-
site_summary: {
|
|
126
|
-
pages_analyzed: 1,
|
|
127
|
-
avg_seo_score: 0,
|
|
128
|
-
thin_pages: 0,
|
|
129
|
-
duplicate_titles: 0,
|
|
130
|
-
site_score: 0
|
|
131
|
-
},
|
|
132
|
-
site_scores: {} as any,
|
|
133
|
-
active_modules: {
|
|
134
|
-
seo: true,
|
|
135
|
-
content: true,
|
|
136
|
-
accessibility: true
|
|
137
|
-
}
|
|
138
|
-
};
|
|
139
|
-
|
|
140
|
-
const csv = renderAnalysisCsv(result);
|
|
141
|
-
const lines = csv.split('\n');
|
|
142
|
-
expect(lines[1]).toContain('Pending/Limit');
|
|
143
|
-
});
|
|
144
|
-
|
|
145
|
-
test('handles missing title and description gracefully', () => {
|
|
146
|
-
const result: AnalysisResult = {
|
|
147
|
-
pages: [
|
|
148
|
-
{
|
|
149
|
-
url: 'https://example.com/missing',
|
|
150
|
-
status: 404,
|
|
151
|
-
seoScore: 0,
|
|
152
|
-
thinScore: 0,
|
|
153
|
-
title: { value: undefined as any, length: 0, status: 'missing' },
|
|
154
|
-
metaDescription: { value: null as any, length: 0, status: 'missing' },
|
|
155
|
-
content: { wordCount: 0 } as any,
|
|
156
|
-
links: { internalLinks: 0, externalLinks: 0 } as any,
|
|
157
|
-
h1: {} as any,
|
|
158
|
-
images: {} as any,
|
|
159
|
-
structuredData: {} as any,
|
|
160
|
-
meta: {}
|
|
161
|
-
}
|
|
162
|
-
],
|
|
163
|
-
site_summary: {
|
|
164
|
-
pages_analyzed: 1,
|
|
165
|
-
avg_seo_score: 0,
|
|
166
|
-
thin_pages: 0,
|
|
167
|
-
duplicate_titles: 0,
|
|
168
|
-
site_score: 0
|
|
169
|
-
},
|
|
170
|
-
site_scores: {} as any,
|
|
171
|
-
active_modules: {
|
|
172
|
-
seo: true,
|
|
173
|
-
content: true,
|
|
174
|
-
accessibility: true
|
|
175
|
-
}
|
|
176
|
-
};
|
|
177
|
-
|
|
178
|
-
const csv = renderAnalysisCsv(result);
|
|
179
|
-
const lines = csv.split('\n');
|
|
180
|
-
// Should produce empty quoted strings ""
|
|
181
|
-
expect(lines[1]).toContain(',"",0,"",0,0,0,0');
|
|
182
|
-
});
|
|
183
|
-
});
|
package/tests/safety.test.ts
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
-
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
3
|
-
import { RateLimiter } from '../src/core/network/rateLimiter.js';
|
|
4
|
-
import { RetryPolicy } from '../src/core/network/retryPolicy.js';
|
|
5
|
-
import { ResponseLimiter } from '../src/core/network/responseLimiter.js';
|
|
6
|
-
import { Readable } from 'stream';
|
|
7
|
-
import * as dns from 'dns';
|
|
8
|
-
|
|
9
|
-
vi.mock('dns', () => ({
|
|
10
|
-
resolve4: vi.fn(),
|
|
11
|
-
resolve6: vi.fn(),
|
|
12
|
-
}));
|
|
13
|
-
|
|
14
|
-
describe('IPGuard', () => {
|
|
15
|
-
it('should block IPv4 internal ranges', () => {
|
|
16
|
-
expect(IPGuard.isInternal('127.0.0.1')).toBe(true);
|
|
17
|
-
expect(IPGuard.isInternal('10.0.0.1')).toBe(true);
|
|
18
|
-
expect(IPGuard.isInternal('192.168.1.1')).toBe(true);
|
|
19
|
-
expect(IPGuard.isInternal('172.16.0.1')).toBe(true);
|
|
20
|
-
expect(IPGuard.isInternal('172.31.255.255')).toBe(true);
|
|
21
|
-
expect(IPGuard.isInternal('169.254.1.1')).toBe(true);
|
|
22
|
-
expect(IPGuard.isInternal('0.0.0.0')).toBe(true);
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it('should allow public IPv4', () => {
|
|
26
|
-
expect(IPGuard.isInternal('8.8.8.8')).toBe(false);
|
|
27
|
-
expect(IPGuard.isInternal('1.1.1.1')).toBe(false);
|
|
28
|
-
expect(IPGuard.isInternal('172.32.0.1')).toBe(false);
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
it('should block IPv6 internal/local addresses', () => {
|
|
32
|
-
expect(IPGuard.isInternal('::1')).toBe(true);
|
|
33
|
-
expect(IPGuard.isInternal('fc00::1')).toBe(true);
|
|
34
|
-
expect(IPGuard.isInternal('fe80::1')).toBe(true);
|
|
35
|
-
});
|
|
36
|
-
|
|
37
|
-
it('should block IPv4-mapped IPv6 internal addresses', () => {
|
|
38
|
-
expect(IPGuard.isInternal('::ffff:127.0.0.1')).toBe(true);
|
|
39
|
-
expect(IPGuard.isInternal('::ffff:10.0.0.1')).toBe(true);
|
|
40
|
-
expect(IPGuard.isInternal('::ffff:192.168.1.1')).toBe(true);
|
|
41
|
-
expect(IPGuard.isInternal('::ffff:169.254.169.254')).toBe(true);
|
|
42
|
-
expect(IPGuard.isInternal('::ffff:7f00:0001')).toBe(true); // Hex 127.0.0.1
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
it('should allow IPv4-mapped IPv6 public addresses', () => {
|
|
46
|
-
expect(IPGuard.isInternal('::ffff:8.8.8.8')).toBe(false);
|
|
47
|
-
});
|
|
48
|
-
|
|
49
|
-
it('should validate hostname by resolving IPs', async () => {
|
|
50
|
-
const resolve4Spy = vi.mocked(dns.resolve4);
|
|
51
|
-
const resolve6Spy = vi.mocked(dns.resolve6);
|
|
52
|
-
|
|
53
|
-
resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['1.1.1.1']));
|
|
54
|
-
resolve6Spy.mockImplementation((_h: string, cb: any) => cb(null, []));
|
|
55
|
-
expect(await IPGuard.validateHost('example.com')).toBe(true);
|
|
56
|
-
|
|
57
|
-
resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['127.0.0.1']));
|
|
58
|
-
expect(await IPGuard.validateHost('localhost')).toBe(false);
|
|
59
|
-
});
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
describe('RateLimiter', () => {
|
|
63
|
-
it('should enforce rate limits', async () => {
|
|
64
|
-
const limiter = new RateLimiter(1); // 1 req/sec = 1000ms interval
|
|
65
|
-
const start = Date.now();
|
|
66
|
-
|
|
67
|
-
await limiter.waitForToken('host1'); // returns immediately, tokens becomes 0
|
|
68
|
-
await limiter.waitForToken('host1'); // waits for refill (1s)
|
|
69
|
-
|
|
70
|
-
const elapsed = Date.now() - start;
|
|
71
|
-
expect(elapsed).toBeGreaterThanOrEqual(1000);
|
|
72
|
-
}, 5000);
|
|
73
|
-
|
|
74
|
-
it('should have separate buckets for hosts', async () => {
|
|
75
|
-
const limiter = new RateLimiter(1);
|
|
76
|
-
const start = Date.now();
|
|
77
|
-
|
|
78
|
-
await limiter.waitForToken('host1');
|
|
79
|
-
await limiter.waitForToken('host2');
|
|
80
|
-
|
|
81
|
-
const elapsed = Date.now() - start;
|
|
82
|
-
expect(elapsed).toBeLessThan(100);
|
|
83
|
-
});
|
|
84
|
-
|
|
85
|
-
it('should respect crawlDelay if higher than rate', async () => {
|
|
86
|
-
const limiter = new RateLimiter(1); // 1000ms interval
|
|
87
|
-
const start = Date.now();
|
|
88
|
-
|
|
89
|
-
await limiter.waitForToken('host3'); // returns immediately, tokens = 0
|
|
90
|
-
await limiter.waitForToken('host3', 1); // 1s crawl delay
|
|
91
|
-
|
|
92
|
-
const elapsed = Date.now() - start;
|
|
93
|
-
expect(elapsed).toBeGreaterThanOrEqual(1000);
|
|
94
|
-
}, 5000);
|
|
95
|
-
});
|
|
96
|
-
|
|
97
|
-
describe('RetryPolicy', () => {
|
|
98
|
-
it('should retry transient failures', async () => {
|
|
99
|
-
let calls = 0;
|
|
100
|
-
const result = await RetryPolicy.execute(
|
|
101
|
-
async () => {
|
|
102
|
-
calls++;
|
|
103
|
-
if (calls < 3) throw new Error('Status 500');
|
|
104
|
-
return 'success';
|
|
105
|
-
},
|
|
106
|
-
(err) => err.message === 'Status 500',
|
|
107
|
-
{ maxRetries: 3, baseDelay: 10 }
|
|
108
|
-
);
|
|
109
|
-
|
|
110
|
-
expect(result).toBe('success');
|
|
111
|
-
expect(calls).toBe(3);
|
|
112
|
-
});
|
|
113
|
-
});
|
|
114
|
-
|
|
115
|
-
describe('ResponseLimiter', () => {
|
|
116
|
-
it('should stream to string', async () => {
|
|
117
|
-
const stream = Readable.from(['hello ', 'world']);
|
|
118
|
-
const result = await ResponseLimiter.streamToString(stream, 100);
|
|
119
|
-
expect(result).toBe('hello world');
|
|
120
|
-
});
|
|
121
|
-
|
|
122
|
-
it('should abort if limit exceeded', async () => {
|
|
123
|
-
const stream = Readable.from(['too ', 'large ', 'content']);
|
|
124
|
-
await expect(ResponseLimiter.streamToString(stream, 5)).rejects.toThrow('Oversized response');
|
|
125
|
-
});
|
|
126
|
-
});
|