@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
|
+
import { request, ProxyAgent } from 'undici';
|
|
4
|
+
|
|
5
|
+
// Partially mock `undici`: keep every real export, but replace `request`
// and `ProxyAgent` with spies so the tests can observe how they are used.
vi.mock('undici', async (importOriginal) => {
  const actualUndici = await importOriginal<typeof import('undici')>();

  // A plain `function` (not an arrow) is used for the stub; presumably the
  // code under test constructs the agent with `new` — confirm in fetcher.
  const proxyAgentStub = vi.fn(function () {
    return {
      request: vi.fn(),
      close: vi.fn()
    };
  });

  return {
    ...actualUndici,
    request: vi.fn(),
    ProxyAgent: proxyAgentStub
  };
});
|
|
18
|
+
|
|
19
|
+
// Verifies that Fetcher wires the configured proxy URL into undici's
// ProxyAgent and rejects malformed proxy URLs at construction time.
describe('Proxy Integration', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('should use ProxyAgent when proxyUrl is provided', async () => {
    const fetcher = new Fetcher({ proxyUrl: 'http://proxy.com:8080', rate: 100 });
    const requestSpy = vi.mocked(request);

    // Minimal stand-in for a response body: emits one chunk and then
    // 'end', each on a zero-delay timer so delivery is asynchronous.
    const fakeBody = {
      on: vi.fn((event, cb) => {
        switch (event) {
          case 'data':
            setTimeout(() => cb(Buffer.from('ok')), 0);
            break;
          case 'end':
            setTimeout(() => cb(), 0);
            break;
        }
        // Returned object allows `.on(...)` call chaining.
        return { on: vi.fn() };
      }),
      dump: vi.fn(),
      text: vi.fn().mockResolvedValue('ok')
    };

    // The next request resolves immediately with a successful response.
    requestSpy.mockResolvedValueOnce({
      statusCode: 200,
      headers: {},
      body: fakeBody
    } as any);

    await fetcher.fetch('http://target.com');

    expect(ProxyAgent).toHaveBeenCalledWith('http://proxy.com:8080');
  });

  it('should fail fast on invalid proxy URL', () => {
    const construct = () => new Fetcher({ proxyUrl: 'not-a-url' });
    expect(construct).toThrow('Invalid proxy URL');
  });
});
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
import { RedirectController } from '../src/core/network/redirectController.js';
|
|
3
|
+
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
4
|
+
import { request } from 'undici';
|
|
5
|
+
|
|
6
|
+
// Fully replace `undici` for this file: `request` is a bare spy whose
// responses are scripted per test, and `ProxyAgent` yields a stub object.
vi.mock('undici', () => {
  const proxyAgentStub = vi.fn();
  proxyAgentStub.mockImplementation(() => ({ dispatcher: {} }));

  return {
    request: vi.fn(),
    ProxyAgent: proxyAgentStub
  };
});
|
|
10
|
+
|
|
11
|
+
// Unit tests for RedirectController.nextHop(): each case feeds a sequence
// of redirect targets and checks the verdict returned for every hop.
describe('RedirectController', () => {
  it('should limit hops', () => {
    const controller = new RedirectController(2);
    const hops: Array<[string, string | null]> = [
      ['http://b.com', null],
      ['http://c.com', null],
      // Third hop exceeds the limit of 2 passed to the constructor.
      ['http://d.com', 'redirect_limit_exceeded']
    ];

    for (const [target, verdict] of hops) {
      expect(controller.nextHop(target)).toBe(verdict);
    }
  });

  it('should detect loops', () => {
    const controller = new RedirectController(5);
    const hops: Array<[string, string | null]> = [
      ['http://b.com', null],
      ['http://a.com', null],
      // Same target as the first hop, so a loop is reported.
      ['http://b.com', 'redirect_loop']
    ];

    for (const [target, verdict] of hops) {
      expect(controller.nextHop(target)).toBe(verdict);
    }
  });
});
|
|
26
|
+
|
|
27
|
+
// Exercises Fetcher's redirect handling end-to-end against a mocked
// undici `request`, with the redirect limit configured to 2.
describe('Fetcher Redirect Integration', () => {
  let fetcher: Fetcher;

  // Builds a mocked 301 response that points at `location`.
  const redirectTo = (location: string) =>
    ({
      statusCode: 301,
      headers: { location },
      body: { dump: vi.fn().mockResolvedValue(undefined) }
    }) as any;

  beforeEach(() => {
    vi.clearAllMocks();
    fetcher = new Fetcher({ rate: 100, maxRedirects: 2 });
  });

  it('should stop at max redirects', async () => {
    const requestSpy = vi.mocked(request);

    // Three consecutive 301s, each to a distinct location.
    requestSpy
      .mockResolvedValueOnce(redirectTo('http://a.com'))
      .mockResolvedValueOnce(redirectTo('http://b.com'))
      .mockResolvedValueOnce(redirectTo('http://c.com'));

    const result = await fetcher.fetch('http://start.com');

    expect(result.status).toBe('redirect_limit_exceeded');
    expect(result.redirectChain).toHaveLength(2);
  });

  it('should detect loops in fetch', async () => {
    const requestSpy = vi.mocked(request);

    // Every request redirects back to the URL the fetch started from.
    requestSpy.mockResolvedValue(redirectTo('http://start.com'));

    const result = await fetcher.fetch('http://start.com');

    expect(result.status).toBe('redirect_loop');
  });
});
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
+
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
3
|
+
import { RateLimiter } from '../src/core/network/rateLimiter.js';
|
|
4
|
+
import { RetryPolicy } from '../src/core/network/retryPolicy.js';
|
|
5
|
+
import { ResponseLimiter } from '../src/core/network/responseLimiter.js';
|
|
6
|
+
import { Readable } from 'stream';
|
|
7
|
+
import * as dns from 'dns';
|
|
8
|
+
|
|
9
|
+
// Replace the `dns` module so hostname resolution is scripted by the tests
// instead of hitting a real resolver.
vi.mock('dns', () => {
  const resolve4 = vi.fn();
  const resolve6 = vi.fn();
  return { resolve4, resolve6 };
});
|
|
13
|
+
|
|
14
|
+
// Tests for IPGuard: static classification of literal addresses and
// hostname validation based on (mocked) DNS answers.
describe('IPGuard', () => {
  it('should block IPv4 internal ranges', () => {
    const internalV4 = [
      '127.0.0.1',
      '10.0.0.1',
      '192.168.1.1',
      '172.16.0.1',
      '172.31.255.255',
      '169.254.1.1',
      '0.0.0.0'
    ];

    for (const address of internalV4) {
      expect(IPGuard.isInternal(address)).toBe(true);
    }
  });

  it('should allow public IPv4', () => {
    // 172.32.0.1 sits just above the 172.16.x-172.31.x addresses checked above.
    const publicV4 = ['8.8.8.8', '1.1.1.1', '172.32.0.1'];

    for (const address of publicV4) {
      expect(IPGuard.isInternal(address)).toBe(false);
    }
  });

  it('should block IPv6 internal/local addresses', () => {
    const internalV6 = ['::1', 'fc00::1', 'fe80::1'];

    for (const address of internalV6) {
      expect(IPGuard.isInternal(address)).toBe(true);
    }
  });

  it('should validate hostname by resolving IPs', async () => {
    const v4Lookup = vi.mocked(dns.resolve4);
    const v6Lookup = vi.mocked(dns.resolve6);

    // Public A record and no AAAA records: host is accepted.
    v4Lookup.mockImplementation((_h: string, cb: any) => cb(null, ['1.1.1.1']));
    v6Lookup.mockImplementation((_h: string, cb: any) => cb(null, []));
    const publicVerdict = await IPGuard.validateHost('example.com');
    expect(publicVerdict).toBe(true);

    // A record resolving to loopback: host is rejected.
    v4Lookup.mockImplementation((_h: string, cb: any) => cb(null, ['127.0.0.1']));
    const loopbackVerdict = await IPGuard.validateHost('localhost');
    expect(loopbackVerdict).toBe(false);
  });
});
|
|
49
|
+
|
|
50
|
+
// Timing tests for RateLimiter. These use the real clock, so the two
// "must wait" cases each take roughly one second.
describe('RateLimiter', () => {
  // Timers are not guaranteed to fire at exactly the requested delay, and a
  // Date.now() delta can land a few milliseconds under the nominal interval.
  // Allow a small slack so the lower-bound assertions are not flaky.
  const TIMER_TOLERANCE_MS = 25;

  it('should enforce rate limits', async () => {
    const limiter = new RateLimiter(1); // 1 req/sec = 1000ms interval
    const start = Date.now();

    await limiter.waitForToken('host1'); // expected to return immediately
    await limiter.waitForToken('host1'); // expected to wait for a refill (~1s)

    const elapsed = Date.now() - start;
    expect(elapsed).toBeGreaterThanOrEqual(1000 - TIMER_TOLERANCE_MS);
  }, 5000);

  it('should have separate buckets for hosts', async () => {
    const limiter = new RateLimiter(1);
    const start = Date.now();

    // Different hosts must not throttle each other.
    await limiter.waitForToken('host1');
    await limiter.waitForToken('host2');

    const elapsed = Date.now() - start;
    expect(elapsed).toBeLessThan(100);
  });

  it('should respect crawlDelay if higher than rate', async () => {
    const limiter = new RateLimiter(1); // 1000ms interval
    const start = Date.now();

    await limiter.waitForToken('host3'); // expected to return immediately
    await limiter.waitForToken('host3', 1); // 1s crawl delay

    const elapsed = Date.now() - start;
    expect(elapsed).toBeGreaterThanOrEqual(1000 - TIMER_TOLERANCE_MS);
  }, 5000);
});
|
|
84
|
+
|
|
85
|
+
// RetryPolicy.execute() should keep invoking the operation while the
// predicate classifies the thrown error as retryable.
describe('RetryPolicy', () => {
  it('should retry transient failures', async () => {
    let attempts = 0;

    const outcome = await RetryPolicy.execute(
      // Operation: fails on the first two attempts, succeeds on the third.
      async () => {
        attempts += 1;
        if (attempts < 3) {
          throw new Error('Status 500');
        }
        return 'success';
      },
      // Predicate: only the simulated 500 error is retryable.
      (err) => err.message === 'Status 500',
      // Short base delay keeps the test fast.
      { maxRetries: 3, baseDelay: 10 }
    );

    expect(outcome).toBe('success');
    expect(attempts).toBe(3);
  });
});
|
|
102
|
+
|
|
103
|
+
// ResponseLimiter.streamToString() collects a stream into a string and
// rejects once the supplied size limit is exceeded.
describe('ResponseLimiter', () => {
  it('should stream to string', async () => {
    const chunks = ['hello ', 'world'];
    const text = await ResponseLimiter.streamToString(Readable.from(chunks), 100);
    expect(text).toBe('hello world');
  });

  it('should abort if limit exceeded', async () => {
    // Combined chunks are longer than the limit of 5.
    const oversized = Readable.from(['too ', 'large ', 'content']);
    const pending = ResponseLimiter.streamToString(oversized, 5);
    await expect(pending).rejects.toThrow('Oversized response');
  });
});
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { DomainFilter } from '../src/core/scope/domainFilter.js';
|
|
3
|
+
import { SubdomainPolicy } from '../src/core/scope/subdomainPolicy.js';
|
|
4
|
+
import { ScopeManager } from '../src/core/scope/scopeManager.js';
|
|
5
|
+
|
|
6
|
+
// Tests for DomainFilter allow/deny list matching.
describe('DomainFilter', () => {
  it('should normalize hostnames', () => {
    // List entries are upper-case with a trailing dot; lookups use the
    // lower-case, dot-less form and must still match.
    const filter = new DomainFilter(['EXAMPLE.COM.'], ['DENY.COM.']);
    expect(filter.isAllowed('example.com')).toBe(true);
    expect(filter.isAllowed('deny.com')).toBe(false);
  });

  it('should respect precedence (deny wins)', () => {
    // Same host on both lists: the deny list takes priority.
    const filter = new DomainFilter(['example.com'], ['example.com']);
    expect(filter.isAllowed('example.com')).toBe(false);
  });

  it('should handle punycode', () => {
    // xn--80ak6aa92e.com is the punycode (ACE) form of the Cyrillic
    // look-alike domain "аррӏе.com". This test only checks that an ACE
    // label matches case-insensitively; no Unicode conversion is exercised.
    const filter = new DomainFilter(['xn--80ak6aa92e.com']);
    expect(filter.isAllowed('XN--80AK6AA92E.COM')).toBe(true);
  });

  it('should block if not in allow list (when list not empty)', () => {
    const filter = new DomainFilter(['allowed.com']);
    expect(filter.isAllowed('other.com')).toBe(false);
  });
});
|
|
29
|
+
|
|
30
|
+
// Tests for SubdomainPolicy: exact-host matching by default, and
// subdomain matching when the second constructor argument is true.
describe('SubdomainPolicy', () => {
  const ROOT = 'https://example.com';

  it('should enforce exact match by default', () => {
    const exactOnly = new SubdomainPolicy(ROOT);
    expect(exactOnly.isAllowed('example.com')).toBe(true);
    expect(exactOnly.isAllowed('sub.example.com')).toBe(false);
  });

  it('should allow valid subdomains when enabled', () => {
    const withSubdomains = new SubdomainPolicy(ROOT, true);
    const accepted = ['example.com', 'sub.example.com', 'deep.sub.example.com'];

    for (const host of accepted) {
      expect(withSubdomains.isAllowed(host)).toBe(true);
    }
  });

  it('should reject malicious suffix matches', () => {
    const withSubdomains = new SubdomainPolicy(ROOT, true);
    // Hosts that merely contain or end with the root string must not pass.
    const rejected = ['evil-example.com', 'example.com.evil.com'];

    for (const host of rejected) {
      expect(withSubdomains.isAllowed(host)).toBe(false);
    }
  });
});
|
|
50
|
+
|
|
51
|
+
// ScopeManager combines the allow list, deny list and subdomain setting
// into a single eligibility verdict per URL.
describe('ScopeManager', () => {
  it('should compose policies correctly', () => {
    const scope = new ScopeManager({
      rootUrl: 'https://example.com',
      allowedDomains: ['example.com', 'sub.example.com', 'other.com'],
      deniedDomains: ['bad.example.com'],
      includeSubdomains: true
    });

    const expectations: Array<[string, string]> = [
      ['https://example.com/', 'allowed'],
      ['https://sub.example.com/', 'allowed'],
      // Explicitly denied host.
      ['https://bad.example.com/', 'blocked_by_domain_filter'],
      ['https://other.com/', 'allowed'],
      // Host that is absent from the allow list.
      ['https://google.com/', 'blocked_by_domain_filter']
    ];

    for (const [url, verdict] of expectations) {
      expect(scope.isUrlEligible(url)).toBe(verdict);
    }
  });
});
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { expect, test } from 'vitest';
|
|
2
|
+
import { scorePageSeo, aggregateSiteScore } from '../src/analysis/scoring.js';
|
|
3
|
+
import { PageAnalysis } from '../src/analysis/analyze.js';
|
|
4
|
+
|
|
5
|
+
// Baseline PageAnalysis fixture with "ok" title, description and H1.
// Individual tests spread it and override fields as needed.
const basePage: PageAnalysis = {
  url: 'https://example.com',
  status: 200,
  title: {
    value: 'x'.repeat(55),
    length: 55,
    status: 'ok'
  },
  metaDescription: {
    value: 'x'.repeat(150),
    length: 150,
    status: 'ok'
  },
  h1: {
    count: 1,
    status: 'ok',
    matchesTitle: false
  },
  content: {
    wordCount: 700,
    textHtmlRatio: 0.3,
    uniqueSentenceCount: 8
  },
  thinScore: 0,
  images: {
    totalImages: 2,
    missingAlt: 0,
    emptyAlt: 0
  },
  links: {
    internalLinks: 5,
    externalLinks: 2,
    nofollowCount: 1,
    // 2 external links out of 7 total.
    externalRatio: 2 / 7
  },
  structuredData: {
    present: true,
    valid: true,
    types: ['Article']
  },
  seoScore: 0
};
|
|
18
|
+
|
|
19
|
+
// scorePageSeo() must stay within [0, 100] for the baseline page and drop
// below 50 for a page that fails every check overridden here.
test('page score stays in 0-100', () => {
  expect(scorePageSeo(basePage)).toBeGreaterThanOrEqual(0);
  expect(scorePageSeo(basePage)).toBeLessThanOrEqual(100);

  // Missing title/description/H1, no content, images without alt text,
  // no structured data, and only nofollow external links.
  const failingFields = {
    title: { value: null, length: 0, status: 'missing' },
    metaDescription: { value: null, length: 0, status: 'missing' },
    h1: { count: 0, status: 'critical', matchesTitle: false },
    content: { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
    thinScore: 100,
    images: { totalImages: 2, missingAlt: 2, emptyAlt: 0 },
    structuredData: { present: false, valid: false, types: [] },
    links: { internalLinks: 0, externalLinks: 9, nofollowCount: 9, externalRatio: 1 }
  } as const;

  const badPage = { ...basePage, ...failingFields } as PageAnalysis;

  expect(scorePageSeo(badPage)).toBeLessThan(50);
});
|
|
36
|
+
|
|
37
|
+
// aggregateSiteScore() takes graph-level metrics plus per-page analyses;
// the SEO health component should equal the mean of the page seoScores.
test('aggregate site score includes existing metrics signals', () => {
  const pages = [
    { ...basePage, seoScore: 70 },
    { ...basePage, seoScore: 90, url: 'https://example.com/2' }
  ];

  const result = aggregateSiteScore(
    {
      totalPages: 2,
      totalEdges: 1,
      orphanPages: ['https://example.com/x'],
      nearOrphans: [],
      deepPages: [],
      topAuthorityPages: [{ url: 'a', authority: 1 }],
      averageOutDegree: 1,
      maxDepthFound: 1,
      crawlEfficiencyScore: 0.8,
      averageDepth: 1,
      structuralEntropy: 2,
      limitReached: false
    },
    pages
  );

  // (70 + 90) / 2
  expect(result.seoHealthScore).toBe(80);
  expect(result.overallScore).toBeGreaterThan(0);
  expect(result.overallScore).toBeLessThanOrEqual(100);
});
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { test, expect, beforeEach } from 'vitest';
|
|
2
|
+
import { Sitemap } from '../src/crawler/sitemap.js';
|
|
3
|
+
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
4
|
+
|
|
5
|
+
// Shared undici MockAgent; recreated before every test.
let mockAgent: MockAgent;

beforeEach(() => {
  const agent = new MockAgent();
  // Refuse any request that has no matching interceptor.
  agent.disableNetConnect();
  mockAgent = agent;
  // Route all undici traffic through the mock for the duration of the test.
  setGlobalDispatcher(agent);
});
|
|
12
|
+
|
|
13
|
+
// A flat <urlset> sitemap should yield exactly the <loc> URLs it lists.
test('fetches and parses simple sitemap', async () => {
  // Payload starts and ends with a newline, as in the original literal.
  const urlsetXml = [
    '',
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    '<url>',
    '<loc>https://example.com/page1</loc>',
    '</url>',
    '<url>',
    '<loc>https://example.com/page2</loc>',
    '</url>',
    '</urlset>',
    ''
  ].join('\n');

  const origin = mockAgent.get('https://example.com');
  origin.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, urlsetXml);

  const reader = new Sitemap();
  const found = await reader.fetch('https://example.com/sitemap.xml');

  expect(found).toContain('https://example.com/page1');
  expect(found).toContain('https://example.com/page2');
  expect(found.length).toBe(2);
});
|
|
36
|
+
|
|
37
|
+
// A <sitemapindex> should be followed into its child sitemap, and only the
// child's page URLs should be returned.
test('handles sitemap index recursively', async () => {
  // Index document pointing at a single child sitemap.
  const indexXml = [
    '',
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    '<sitemap>',
    '<loc>https://example.com/sitemap1.xml</loc>',
    '</sitemap>',
    '</sitemapindex>',
    ''
  ].join('\n');

  // Child sitemap containing one page.
  const childXml = [
    '',
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    '<url>',
    '<loc>https://example.com/page3</loc>',
    '</url>',
    '</urlset>',
    ''
  ].join('\n');

  const origin = mockAgent.get('https://example.com');
  origin.intercept({ path: '/sitemap-index.xml', method: 'GET' }).reply(200, indexXml);
  origin.intercept({ path: '/sitemap1.xml', method: 'GET' }).reply(200, childXml);

  const reader = new Sitemap();
  const found = await reader.fetch('https://example.com/sitemap-index.xml');

  expect(found).toContain('https://example.com/page3');
  expect(found.length).toBe(1);
});
|
|
71
|
+
|
|
72
|
+
// A 200 response whose body is not XML should produce an empty URL list.
test('handles invalid xml gracefully', async () => {
  const origin = mockAgent.get('https://example.com');
  const interceptor = origin.intercept({ path: '/bad.xml', method: 'GET' });
  interceptor.reply(200, 'Not XML');

  const reader = new Sitemap();
  const found = await reader.fetch('https://example.com/bad.xml');

  expect(found.length).toBe(0);
});
|
|
80
|
+
|
|
81
|
+
// A 500 response should produce an empty URL list.
test('handles fetch errors gracefully', async () => {
  const origin = mockAgent.get('https://example.com');
  const interceptor = origin.intercept({ path: '/error.xml', method: 'GET' });
  interceptor.reply(500, 'Error');

  const reader = new Sitemap();
  const found = await reader.fetch('https://example.com/error.xml');

  expect(found.length).toBe(0);
});
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { Parser } from '../src/crawler/parser.js';
|
|
3
|
+
|
|
4
|
+
// Soft-404 heuristics: pages served with HTTP 200 whose content looks like
// an error page. Each case checks the score and/or the reported signals.
describe('Soft 404 Detection', () => {
  const parser = new Parser();
  const baseUrl = 'https://example.com';

  // Parses `html` as a 200 response fetched from baseUrl.
  const parseOk = (html: string) => parser.parse(html, baseUrl, 200);

  it('should detect soft 404 by title pattern', () => {
    const { soft404Score, soft404Signals } = parseOk(
      '<html><head><title>Page Not Found</title></head><body>Welcome to the site</body></html>'
    );
    expect(soft404Score).toBeGreaterThan(0.3);
    expect(soft404Signals).toContain('title_pattern_not_found');
  });

  it('should detect soft 404 by H1 pattern', () => {
    const { soft404Score, soft404Signals } = parseOk(
      '<html><body><h1>404 Error</h1></body></html>'
    );
    expect(soft404Score).toBeGreaterThan(0.2);
    expect(soft404Signals).toContain('h1_pattern_404');
  });

  it('should detect soft 404 by very low word count', () => {
    const { soft404Score, soft404Signals } = parseOk(
      '<html><body>Short text</body></html>'
    );
    expect(soft404Score).toBeGreaterThan(0.2);
    expect(soft404Signals).toContain('very_low_word_count');
  });

  it('should detect soft 404 by lack of outbound links', () => {
    const { soft404Signals } = parseOk(
      '<html><body>A page with some text but no links.</body></html>'
    );
    expect(soft404Signals).toContain('no_outbound_links');
  });

  it('should combine multiple signals for high score', () => {
    const { soft404Score } = parseOk(
      '<html><head><title>Error</title></head><body><h1>Not Found</h1><p>The requested page was not found.</p></body></html>'
    );
    // Per the original author's note the contributions are
    // title (0.4) + h1 (0.3) + body phrase (0.2) + low word count (0.3) = 1.2,
    // capped at 1.0. The weights live in the parser — verify there.
    expect(soft404Score).toBe(1.0);
  });
});
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { TrapDetector } from '../src/crawler/trap.js';
|
|
3
|
+
|
|
4
|
+
// Tests for TrapDetector.checkTrap(). One detector instance is shared, and
// the loop-based tests show that it accumulates state across calls, so
// every test resets it first to stay independent of execution order.
describe('TrapDetector', () => {
  const detector = new TrapDetector();

  it('should detect session ID traps', () => {
    detector.reset();
    const result = detector.checkTrap('https://example.com/page?sid=12345', 1);
    expect(result.risk).toBeGreaterThan(0.8);
    expect(result.type).toBe('session_trap');
  });

  it('should detect calendar patterns', () => {
    detector.reset();
    const result = detector.checkTrap('https://example.com/archive/2023/12/01/', 1);
    expect(result.risk).toBeGreaterThan(0.6);
    expect(result.type).toBe('calendar_trap');
  });

  it('should detect pagination loops', () => {
    detector.reset();
    // Simulate many pages of the same listing before the checked URL.
    for (let i = 1; i <= 60; i++) {
      detector.checkTrap(`https://example.com/blog?page=${i}`, 1);
    }
    const result = detector.checkTrap('https://example.com/blog?page=61', 1);
    expect(result.risk).toBeGreaterThan(0.8);
    expect(result.type).toBe('pagination_loop');
  });

  it('should detect faceted navigation / parameter explosion', () => {
    detector.reset();
    const basePath = 'https://example.com/products';
    // Many parameter combinations on the same path.
    for (let i = 1; i <= 35; i++) {
      detector.checkTrap(`${basePath}?color=red&size=${i}`, 1);
    }
    const result = detector.checkTrap(`${basePath}?color=blue&size=large`, 1);
    expect(result.risk).toBeGreaterThan(0.9);
    expect(result.type).toBe('faceted_navigation');
  });
});
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { SITEGRAPH_HTML } from '../src/report/sitegraph_template.js';
|
|
3
|
+
import { Graph } from '../src/graph/graph.js';
|
|
4
|
+
import { computePageRank } from '../src/graph/pagerank.js';
|
|
5
|
+
|
|
6
|
+
// Checks that graph JSON carries PageRank scores and that the site-graph
// HTML template contains the markup and script fragments for Authority Mode.
describe('Visualization Data & Template', () => {
  it('should include pageRankScore in graph JSON output after PageRank computation', () => {
    const urlA = 'https://a.com';
    const urlB = 'https://b.com';

    const graph = new Graph();
    graph.addNode(urlA, 0, 200);
    graph.addNode(urlB, 1, 200);
    graph.addEdge(urlA, urlB);

    computePageRank(graph);

    const { nodes } = graph.toJSON();
    const nodeA = nodes.find(n => n.url === urlA);
    const nodeB = nodes.find(n => n.url === urlB);

    expect(nodeA).toBeDefined();
    expect(nodeB).toBeDefined();
    expect(typeof nodeA?.pageRankScore).toBe('number');
    expect(typeof nodeB?.pageRankScore).toBe('number');
  });

  it('should contain UI toggle buttons for Authority Mode', () => {
    for (const idAttr of ['id="btn-auth-pagerank"', 'id="btn-auth-structural"']) {
      expect(SITEGRAPH_HTML).toContain(idAttr);
    }
  });

  it('should contain setAuthorityMode function', () => {
    // Regex tolerates whitespace differences in the function signature.
    const signature = /function\s+setAuthorityMode\s*\(mode,\s*btn\)/;
    expect(SITEGRAPH_HTML).toMatch(signature);
    expect(SITEGRAPH_HTML).toContain(
      "n.authority = mode === 'pagerank' ? n.pageRankAuthority : n.structuralAuthority"
    );
  });

  it('should contain logic to calculate pageRankAuthority from pageRankScore', () => {
    const fragments = [
      'n.pageRankAuthority = n.pageRankScore / 100',
      'n.structuralAuthority = Math.log(1 + n.inLinks)'
    ];
    for (const fragment of fragments) {
      expect(SITEGRAPH_HTML).toContain(fragment);
    }
  });

  it('should update details panel to show both metrics', () => {
    expect(SITEGRAPH_HTML).toContain('id="d-auth-container"');
    // Single-quoted on purpose: `${...}` must be matched literally in the template.
    expect(SITEGRAPH_HTML).toContain('In-Degree: ${structVal}');
    expect(SITEGRAPH_HTML).toContain('PR: <strong>${prVal}</strong>');
  });
});
|