@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { calculateScore } from '../../src/audit/scoring.js';
|
|
3
|
+
import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics, AuditIssue } from '../../src/audit/types.js';
|
|
4
|
+
|
|
5
|
+
describe('Scoring Engine', () => {
    /**
     * Builds the asymmetric matcher asserting that an issue list contains an
     * entry for every given id. Additional, unlisted issues are tolerated.
     */
    const containsIssues = (...ids: string[]) =>
        expect.arrayContaining(ids.map((id) => expect.objectContaining({ id })));

    // Baseline fixtures. Passed together they are expected to produce a perfect
    // score; every test below overrides only the fields it wants penalized.
    const healthyTransport: TransportDiagnostics = {
        tlsVersion: 'TLSv1.3',
        cipherSuite: 'TLS_AES_256_GCM_SHA384',
        alpnProtocol: 'h2',
        certificate: {
            issuer: 'Let\'s Encrypt',
            subject: 'example.com',
            validFrom: '2023-01-01',
            validTo: '2024-01-01',
            daysUntilExpiry: 60,
            isSelfSigned: false,
            isValidChain: true,
            fingerprint: 'SHA256:...'
        } as any,
        httpVersion: '2.0',
        compression: ['gzip'],
        keepAlive: true,
        transferEncoding: null,
        redirectCount: 0,
        redirects: [],
        serverHeader: 'nginx',
        headers: {}
    };

    const healthyDns: DnsDiagnostics = {
        a: ['1.1.1.1', '1.0.0.1'],
        aaaa: ['2606:4700:4700::1111'],
        cname: [],
        reverse: [],
        ipCount: 3,
        ipv6Support: true,
        resolutionTime: 10
    };

    const healthyHeaders: SecurityHeadersResult = {
        strictTransportSecurity: { present: true, valid: true, value: 'max-age=31536000' },
        contentSecurityPolicy: { present: true, valid: true, value: "default-src 'self'" },
        xFrameOptions: { present: true, valid: true, value: 'DENY' },
        xContentTypeOptions: { present: true, valid: true, value: 'nosniff' },
        referrerPolicy: { present: true, valid: true, value: 'strict-origin' },
        permissionsPolicy: { present: true, valid: true, value: 'geolocation=()' },
        details: {},
        score: 100
    };

    const healthyPerformance: PerformanceMetrics = {
        dnsLookupTime: 10,
        tcpConnectTime: 20,
        tlsHandshakeTime: 30,
        ttfb: 100,
        totalTime: 200,
        htmlSize: 50000,
        headerSize: 500,
        redirectTime: 0
    };

    it('should give perfect score for perfect inputs', () => {
        const { score, grade, issues } = calculateScore(
            healthyTransport,
            healthyDns,
            healthyHeaders,
            healthyPerformance,
            []
        );

        expect(score).toBe(100);
        expect(grade).toBe('A');
        expect(issues).toHaveLength(0);
    });

    it('should penalize TLS < 1.2', () => {
        const legacyTls = { ...healthyTransport, tlsVersion: 'TLSv1.1' };

        const { score, categoryScores, issues } = calculateScore(
            legacyTls,
            healthyDns,
            healthyHeaders,
            healthyPerformance,
            []
        );

        expect(score).toBeLessThan(100);
        expect(categoryScores.transport).toBeLessThan(30);
        expect(issues).toEqual(containsIssues('tls-old'));
    });

    it('should penalize missing HTTPS', () => {
        // No TLS version and no certificate models a plain-HTTP endpoint.
        const plainHttp = { ...healthyTransport, tlsVersion: null, certificate: null };

        const { score, grade, issues } = calculateScore(
            plainHttp,
            healthyDns,
            healthyHeaders,
            healthyPerformance,
            []
        );

        expect(score).toBeLessThan(50); // Critical
        expect(grade).toBe('F');
        expect(issues).toEqual(containsIssues('no-https'));
    });

    it('should fail on expired cert', () => {
        const lapsedCertificate = {
            ...healthyTransport.certificate!,
            daysUntilExpiry: -5,
            validTo: '2023-01-01'
        };
        const expiredTransport = { ...healthyTransport, certificate: lapsedCertificate };

        const { grade, score, issues } = calculateScore(
            expiredTransport,
            healthyDns,
            healthyHeaders,
            healthyPerformance,
            []
        );

        expect(grade).toBe('F');
        expect(score).toBeLessThanOrEqual(40);
        expect(issues).toEqual(containsIssues('cert-expired'));
    });

    it('should penalize missing security headers', () => {
        // A header sub-score of 50 is expected to halve the security category:
        // the assertions below pin it at 10 (out of 20), i.e. 10 points lost overall.
        const absentHsts = { present: false, valid: false, value: null };
        const weakHeaders = { ...healthyHeaders, score: 50, strictTransportSecurity: absentHsts };

        const { categoryScores, score, issues } = calculateScore(
            healthyTransport,
            healthyDns,
            weakHeaders,
            healthyPerformance,
            []
        );

        expect(categoryScores.security).toBe(10);
        expect(score).toBe(90); // 100 - 10
        expect(issues).toEqual(containsIssues('hsts-missing'));
    });

    it('should penalize poor performance', () => {
        const sluggish = { ...healthyPerformance, ttfb: 1000, htmlSize: 2000000 };

        const { categoryScores, score, issues } = calculateScore(
            healthyTransport,
            healthyDns,
            healthyHeaders,
            sluggish,
            []
        );

        // Expected deductions: 10 pts for TTFB above 800 plus 5 pts for HTML
        // above 1MB, leaving 15 of the 30 performance points.
        expect(categoryScores.performance).toBe(15);
        expect(score).toBe(85);
        expect(issues).toEqual(containsIssues('slow-ttfb', 'large-html'));
    });

    it('should penalize infrastructure issues', () => {
        const fragileDns = { ...healthyDns, ipv6Support: false, ipCount: 1 };

        const { categoryScores, score, issues } = calculateScore(
            healthyTransport,
            fragileDns,
            healthyHeaders,
            healthyPerformance,
            []
        );

        // Expected deductions: 10 pts for missing IPv6 plus 10 pts for a single
        // IP, which exhausts the 20 infrastructure points.
        expect(categoryScores.infrastructure).toBe(0);
        expect(score).toBe(80);
        expect(issues).toEqual(containsIssues('no-ipv6', 'single-ip'));
    });
});
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { auditUrl } from '../../src/audit/index.js';
|
|
3
|
+
|
|
4
|
+
describe('Audit Security', () => {
    // Rejection message both tests expect when the target is not publicly routable.
    const BLOCKED_MESSAGE = 'Access to internal or private infrastructure is prohibited';

    it('should block audits of internal IP addresses', async () => {
        // Loopback target.
        const pendingAudit = auditUrl('http://127.0.0.1');

        await expect(pendingAudit).rejects.toThrow(BLOCKED_MESSAGE);
    });

    it('should block audits of link-local addresses', async () => {
        // 169.254.0.0/16 target.
        const pendingAudit = auditUrl('http://169.254.169.254');

        await expect(pendingAudit).rejects.toThrow(BLOCKED_MESSAGE);
    });
});
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import { describe, it, expect, vi, afterEach } from 'vitest';
|
|
2
|
+
import { analyzeTransport } from '../../src/audit/transport.js';
|
|
3
|
+
import https from 'node:https';
|
|
4
|
+
import http from 'node:http';
|
|
5
|
+
import tls from 'node:tls';
|
|
6
|
+
import { EventEmitter } from 'events';
|
|
7
|
+
|
|
8
|
+
vi.mock('node:https');
|
|
9
|
+
vi.mock('node:http');
|
|
10
|
+
|
|
11
|
+
describe('Transport Diagnostics', () => {
    /** Creates a request double exposing only the methods these tests stub. */
    const createMockRequest = () => {
        const req = new EventEmitter() as any;
        req.end = vi.fn();
        req.destroy = vi.fn();
        return req;
    };

    /**
     * Creates an EventEmitter that passes `instanceof tls.TLSSocket` checks and
     * carries the given members (methods/properties) as own properties.
     */
    const createTlsSocket = (members: Record<string, unknown> = {}) => {
        const socket = new EventEmitter() as any;
        Object.setPrototypeOf(socket, tls.TLSSocket.prototype);
        return Object.assign(socket, members);
    };

    afterEach(() => {
        // clearAllMocks() only wipes call history; the persistent
        // mockImplementation installed on https.request by one test would
        // otherwise leak into the next. resetAllMocks() also drops the
        // implementations, so every test starts from a clean mock.
        vi.resetAllMocks();
    });

    it('should analyze HTTPS transport', async () => {
        // Mock Response
        const mockRes = new EventEmitter() as any;
        mockRes.statusCode = 200;
        mockRes.statusMessage = 'OK';
        mockRes.headers = {
            'content-encoding': 'gzip',
            'server': 'nginx',
            'connection': 'keep-alive'
        };
        mockRes.httpVersion = '1.1';

        // TLS socket double reporting a TLSv1.3 / h2 session.
        mockRes.socket = createTlsSocket({
            getPeerCertificate: () => ({
                subject: { CN: 'example.com' },
                issuer: { CN: 'Let\'s Encrypt' },
                valid_from: 'Jan 1 2023',
                valid_to: 'Jan 1 2024',
                fingerprint: 'SHA256:...'
            }),
            getProtocol: () => 'TLSv1.3',
            getCipher: () => ({ name: 'TLS_AES_...' }),
            alpnProtocol: 'h2',
            authorized: true
        });

        // Mock Request
        const mockReq = createMockRequest();

        // Mock https.request: hand the response to the callback immediately,
        // then replay the socket and body events on a later tick.
        vi.spyOn(https, 'request').mockImplementation((url, options, cb) => {
            if (cb) cb(mockRes);
            // Simulate socket events
            setTimeout(() => {
                mockReq.emit('socket', mockRes.socket);
                mockRes.socket.emit('lookup');
                mockRes.socket.emit('connect');
                mockRes.socket.emit('secureConnect');
                mockReq.emit('finish');
                // Response data
                mockRes.emit('data', Buffer.from('<html></html>'));
                mockRes.emit('end');
            }, 10);
            return mockReq;
        });

        const result = await analyzeTransport('https://example.com', 1000);
        expect(result.transport.tlsVersion).toBe('TLSv1.3');
        expect(result.transport.httpVersion).toBe('1.1');
        expect(result.performance.htmlSize).toBeGreaterThan(0);
        expect(result.transport.headers['server']).toBe('nginx');
    });

    it('should handle redirects', async () => {
        // First hop: 301 pointing at the final URL.
        const req1 = createMockRequest();
        const res1 = new EventEmitter() as any;
        res1.statusCode = 301;
        res1.headers = { location: 'https://example.com/' };
        res1.socket = createTlsSocket();

        // Second hop: 200 served over a socket that answers the TLS checks.
        const req2 = createMockRequest();
        const res2 = new EventEmitter() as any;
        res2.statusCode = 200;
        res2.headers = {};
        res2.socket = createTlsSocket({
            getPeerCertificate: () => ({}),
            getProtocol: () => 'TLSv1.2',
            getCipher: () => ({ name: 'AES' })
        });

        // One-shot implementations: the first call serves the redirect, the
        // second serves the final response.
        const requestSpy = vi.spyOn(https, 'request');
        requestSpy
            .mockImplementationOnce((url, options, cb) => {
                if (cb) cb(res1);
                setTimeout(() => {
                    req1.emit('socket', res1.socket);
                    res1.emit('data', Buffer.from('redirecting'));
                    res1.emit('end');
                }, 10);
                return req1;
            })
            .mockImplementationOnce((url, options, cb) => {
                if (cb) cb(res2);
                setTimeout(() => {
                    req2.emit('socket', res2.socket);
                    res2.emit('data', Buffer.from('ok'));
                    res2.emit('end');
                }, 10);
                return req2;
            });

        const result = await analyzeTransport('https://redirect.com', 1000);
        expect(result.transport.redirectCount).toBe(1);
        expect(result.transport.redirects[0].location).toBe('https://example.com/');
    });
});
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
+
import { Graph } from '../src/graph/graph.js';
|
|
3
|
+
import { detectContentClusters } from '../src/graph/cluster.js';
|
|
4
|
+
|
|
5
|
+
describe('Content Clustering', () => {
    type NodeData = Parameters<Graph['updateNodeData']>[1];

    let graph: Graph;

    beforeEach(() => {
        graph = new Graph();
    });

    /**
     * Registers every URL first (depth 0, status 200) and only then attaches
     * the per-page data, mirroring a two-phase "add all, then annotate" setup.
     */
    const seedPages = (pages: Array<[string, NodeData]>) => {
        pages.forEach(([url]) => graph.addNode(url, 0, 200));
        pages.forEach(([url, data]) => graph.updateNodeData(url, data));
    };

    it('should group similar pages into a cluster', () => {
        // Near-identical simhashes: h2 is 1 bit away from both h1 and h3, while
        // h1 and h3 differ by 2 bits -- all within the distance of 2 passed below.
        const h1 = 0b101010n;
        const h2 = 0b101011n;
        const h3 = 0b101001n;

        seedPages([
            ['https://example.com/p1', { simhash: h1.toString() }],
            ['https://example.com/p2', { simhash: h2.toString() }],
            ['https://example.com/p3', { simhash: h3.toString() }]
        ]);

        const clusters = detectContentClusters(graph, 2, 2);

        expect(clusters.length).toBe(1);
        expect(clusters[0].count).toBe(3);
        expect(graph.nodes.get('https://example.com/p1')?.clusterId).toBe(1);
    });

    it('should separate dissimilar pages', () => {
        // Simhashes that differ in all ten low bits.
        const h1 = 0b1111111111n;
        const h2 = 0b0000000000n;

        seedPages([
            ['https://example.com/p1', { simhash: h1.toString() }],
            ['https://example.com/p2', { simhash: h2.toString() }]
        ]);

        const clusters = detectContentClusters(graph, 2, 2);

        expect(clusters.length).toBe(0); // None meet minSize 2
    });

    it('should respect minClusterSize', () => {
        // Two pages one bit apart cannot satisfy a minimum cluster size of 3.
        const h1 = 0b1n;
        const h2 = 0b0n;

        seedPages([
            ['https://example.com/p1', { simhash: h1.toString() }],
            ['https://example.com/p2', { simhash: h2.toString() }]
        ]);

        const clusters = detectContentClusters(graph, 1, 3);
        expect(clusters.length).toBe(0);
    });

    it('should identify shared path prefixes (silos)', () => {
        // Three identical-hash pages that all live under /blog.
        const h = 0b111n;

        seedPages([
            ['https://example.com/blog/seo-tips', { simhash: h.toString() }],
            ['https://example.com/blog/link-building', { simhash: h.toString() }],
            ['https://example.com/blog/technical-seo', { simhash: h.toString() }]
        ]);

        const clusters = detectContentClusters(graph, 0, 3);
        expect(clusters[0].sharedPathPrefix).toBe('/blog');
    });

    it('should be deterministic with unstable input order', () => {
        // Pages are inserted in non-sorted order (z, a, m) with identical hashes
        // and PageRanks, so the primary must be decided by a stable tie-break.
        const h = 0b111n;

        seedPages([
            ['https://example.com/z', { simhash: h.toString(), pageRank: 10 }],
            ['https://example.com/a', { simhash: h.toString(), pageRank: 10 }],
            ['https://example.com/m', { simhash: h.toString(), pageRank: 10 }]
        ]);

        const clusters = detectContentClusters(graph, 0, 3);
        // a should be primary because it's shortest/lexicographic first since PageRanks are same
        expect(clusters[0].primaryUrl).toBe('https://example.com/a');
    });

    it('should use band optimization correctly (heuristic nature)', () => {
        // Adds one page and immediately attaches its simhash.
        const addHashedPage = (url: string, hash: bigint) => {
            graph.addNode(url, 0, 200);
            graph.updateNodeData(url, { simhash: hash.toString() });
        };

        // Group 1: every hash shares the low 16 bits (0xAAAA); only the bits
        // above them vary with the index.
        for (let idx = 0; idx < 5; idx += 1) {
            addHashedPage(`https://example.com/g1/${idx}`, BigInt(0xAAAA) | (BigInt(idx) << 16n));
        }

        // Group 2: every hash shares the second 16 bits (0xBBBB << 16); only
        // the low bits vary with the index.
        for (let idx = 0; idx < 5; idx += 1) {
            addHashedPage(`https://example.com/g2/${idx}`, (BigInt(0xBBBB) << 16n) | BigInt(idx));
        }

        const clusters = detectContentClusters(graph, 5, 3);
        expect(clusters.length).toBe(2);
        expect(clusters[0].count).toBe(5);
        expect(clusters[1].count).toBe(5);
    });
});
|