@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,133 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { calculateScore } from '../../src/audit/scoring.js';
3
+ import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics, AuditIssue } from '../../src/audit/types.js';
4
+
5
+ describe('Scoring Engine', () => {
6
+ const mockTransport: TransportDiagnostics = { // Healthy baseline transport fixture; the tests below spread it and override single fields.
7
+ tlsVersion: 'TLSv1.3',
8
+ cipherSuite: 'TLS_AES_256_GCM_SHA384',
9
+ alpnProtocol: 'h2',
10
+ certificate: {
11
+ issuer: 'Let\'s Encrypt',
12
+ subject: 'example.com',
13
+ validFrom: '2023-01-01',
14
+ validTo: '2024-01-01',
15
+ daysUntilExpiry: 60, // hard-coded, not derived from validTo; the expired-cert test overrides it with a negative value
16
+ isSelfSigned: false,
17
+ isValidChain: true,
18
+ fingerprint: 'SHA256:...'
19
+ } as any, // the any-cast switches off type-checking for this certificate literal
20
+ httpVersion: '2.0',
21
+ compression: ['gzip'],
22
+ keepAlive: true,
23
+ transferEncoding: null,
24
+ redirectCount: 0,
25
+ redirects: [],
26
+ serverHeader: 'nginx',
27
+ headers: {}
28
+ };
29
+
30
+ const mockDns: DnsDiagnostics = { // Healthy baseline DNS fixture: several addresses and IPv6 enabled.
31
+ a: ['1.1.1.1', '1.0.0.1'],
32
+ aaaa: ['2606:4700:4700::1111'],
33
+ cname: [],
34
+ reverse: [],
35
+ ipCount: 3, // equals the 2 A + 1 AAAA addresses listed above
36
+ ipv6Support: true,
37
+ resolutionTime: 10 // NOTE(review): unit is presumably milliseconds - confirm against DnsDiagnostics
38
+ };
39
+
40
+ const mockHeaders: SecurityHeadersResult = { // Healthy baseline header fixture: all six listed headers are present and valid.
41
+ strictTransportSecurity: { present: true, valid: true, value: 'max-age=31536000' },
42
+ contentSecurityPolicy: { present: true, valid: true, value: "default-src 'self'" },
43
+ xFrameOptions: { present: true, valid: true, value: 'DENY' },
44
+ xContentTypeOptions: { present: true, valid: true, value: 'nosniff' },
45
+ referrerPolicy: { present: true, valid: true, value: 'strict-origin' },
46
+ permissionsPolicy: { present: true, valid: true, value: 'geolocation=()' },
47
+ details: {},
48
+ score: 100 // header audit's own score; the missing-headers test lowers it to 50
49
+ };
50
+
51
+ const mockPerformance: PerformanceMetrics = { // Healthy baseline performance fixture. NOTE(review): timings presumably in ms, sizes in bytes - confirm against PerformanceMetrics.
52
+ dnsLookupTime: 10,
53
+ tcpConnectTime: 20,
54
+ tlsHandshakeTime: 30,
55
+ ttfb: 100, // the poor-performance test raises this to 1000
56
+ totalTime: 200,
57
+ htmlSize: 50000, // the poor-performance test raises this to 2000000
58
+ headerSize: 500,
59
+ redirectTime: 0
60
+ };
61
+
62
+ it('should give perfect score for perfect inputs', () => { // Baseline case: unmodified fixtures and an empty fifth argument.
63
+ const result = calculateScore(mockTransport, mockDns, mockHeaders, mockPerformance, []);
64
+ expect(result.score).toBe(100);
65
+ expect(result.grade).toBe('A');
66
+ expect(result.issues).toHaveLength(0); // no issue of any kind may be reported for the healthy fixtures
67
+ });
68
+
69
+ it('should penalize TLS < 1.2', () => { // Overrides only tlsVersion; every other input stays at the healthy baseline.
70
+ const badTransport = { ...mockTransport, tlsVersion: 'TLSv1.1' };
71
+ const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
72
+ expect(result.score).toBeLessThan(100);
73
+ expect(result.categoryScores.transport).toBeLessThan(30); // NOTE(review): 30 is presumably the transport category maximum - confirm in scoring.ts
74
+ expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'tls-old' })])); // extra issues are tolerated; only tls-old is required
75
+ });
76
+
77
+ it('should penalize missing HTTPS', () => { // Models a plain-HTTP site by nulling both tlsVersion and certificate.
78
+ const badTransport = { ...mockTransport, tlsVersion: null, certificate: null };
79
+ const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
80
+ expect(result.score).toBeLessThan(50); // Critical: missing HTTPS must pull the overall score below 50
81
+ expect(result.grade).toBe('F');
82
+ expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'no-https' })])); // extra issues are tolerated; only no-https is required
83
+ });
84
+
85
+ it('should fail on expired cert', () => { // Keeps TLS itself healthy and only marks the certificate as expired.
86
+ const expiredTransport = {
87
+ ...mockTransport,
88
+ certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' } // negative daysUntilExpiry = already expired; the non-null assertion is needed to spread the nullable certificate
89
+ };
90
+ const result = calculateScore(expiredTransport, mockDns, mockHeaders, mockPerformance, []);
91
+ expect(result.grade).toBe('F');
92
+ expect(result.score).toBeLessThanOrEqual(40); // upper bound only; the exact score is not pinned
93
+ expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'cert-expired' })]));
94
+ });
95
+
96
+ it('should penalize missing security headers', () => {
97
+ // badHeaders.score = 50 is the header audit's internal score, i.e. half of the header points are lost.
98
+ // The headers (security) category is worth 20 points in total, so half of it leaves 10 and costs 10 overall.
99
+ const badHeaders = { ...mockHeaders, score: 50, strictTransportSecurity: { present: false, valid: false, value: null } };
100
+ const result = calculateScore(mockTransport, mockDns, badHeaders, mockPerformance, []);
101
+ expect(result.categoryScores.security).toBe(10);
102
+ expect(result.score).toBe(90); // 100 - 10 lost in the security category
103
+ expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'hsts-missing' })])); // raised because strictTransportSecurity.present is false
104
+ });
105
+
106
+ it('should penalize poor performance', () => { // Overrides ttfb and htmlSize only; the other timings stay healthy.
107
+ const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
108
+ const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
109
+ // ttfb 1000 exceeds the 800 threshold: lose 10 pts
110
+ // htmlSize 2000000 exceeds the 1MB threshold: lose 5 pts
111
+ // Performance category: 30 - 10 - 5 = 15.
112
+ expect(result.categoryScores.performance).toBe(15);
113
+ expect(result.score).toBe(85); // 100 - 15 lost in the performance category
114
+ expect(result.issues).toEqual(expect.arrayContaining([
115
+ expect.objectContaining({ id: 'slow-ttfb' }),
116
+ expect.objectContaining({ id: 'large-html' })
117
+ ]));
118
+ });
119
+
120
+ it('should penalize infrastructure issues', () => { // Overrides ipv6Support and ipCount only; the address arrays are left untouched.
121
+ const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
122
+ const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
123
+ // ipv6Support false: lose 10 pts
124
+ // ipCount 1 (single IP): lose 10 pts
125
+ // Infrastructure category: 20 - 10 - 10 = 0.
126
+ expect(result.categoryScores.infrastructure).toBe(0);
127
+ expect(result.score).toBe(80); // 100 - 20 lost in the infrastructure category
128
+ expect(result.issues).toEqual(expect.arrayContaining([
129
+ expect.objectContaining({ id: 'no-ipv6' }),
130
+ expect.objectContaining({ id: 'single-ip' })
131
+ ]));
132
+ });
133
+ });
@@ -0,0 +1,12 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { auditUrl } from '../../src/audit/index.js';
3
+
4
+ describe('Audit Security', () => {
5
+ it('should block audits of internal IP addresses', async () => { // 127.0.0.1 is the IPv4 loopback address
6
+ await expect(auditUrl('http://127.0.0.1')).rejects.toThrow('Access to internal or private infrastructure is prohibited'); // the promise must reject with an error whose message contains this text
7
+ });
8
+
9
+ it('should block audits of link-local addresses', async () => { // 169.254.169.254 lies in the IPv4 link-local range 169.254.0.0/16
10
+ await expect(auditUrl('http://169.254.169.254')).rejects.toThrow('Access to internal or private infrastructure is prohibited'); // same rejection message as for the loopback case
11
+ });
12
+ });
@@ -0,0 +1,112 @@
1
+ import { describe, it, expect, vi, afterEach } from 'vitest';
2
+ import { analyzeTransport } from '../../src/audit/transport.js';
3
+ import https from 'node:https';
4
+ import http from 'node:http';
5
+ import tls from 'node:tls';
6
+ import { EventEmitter } from 'events';
7
+
8
+ vi.mock('node:https');
9
+ vi.mock('node:http');
10
+
11
+ describe('Transport Diagnostics', () => {
12
+ afterEach(() => { // Runs after every test in this describe block.
13
+ vi.clearAllMocks(); // NOTE(review): this presumably resets recorded calls only, so the https.request implementation installed by one test may persist into the next - consider vi.restoreAllMocks(); confirm against the Vitest version in use
14
+ });
15
+
16
+ it('should analyze HTTPS transport', async () => { // Drives analyzeTransport with a fully faked https.request / response / TLS socket.
17
+ // Fake response: a bare EventEmitter carrying the status, headers and HTTP version fields
18
+ const mockRes = new EventEmitter() as any;
19
+ mockRes.statusCode = 200;
20
+ mockRes.statusMessage = 'OK';
21
+ mockRes.headers = {
22
+ 'content-encoding': 'gzip',
23
+ 'server': 'nginx',
24
+ 'connection': 'keep-alive'
25
+ };
26
+ mockRes.httpVersion = '1.1';
27
+
28
+ const mockSocket = new EventEmitter();
29
+ Object.setPrototypeOf(mockSocket, tls.TLSSocket.prototype); // makes the fake pass an instanceof tls.TLSSocket check - presumably what analyzeTransport relies on; verify
30
+ (mockSocket as any).getPeerCertificate = () => ({
31
+ subject: { CN: 'example.com' },
32
+ issuer: { CN: 'Let\'s Encrypt' },
33
+ valid_from: 'Jan 1 2023',
34
+ valid_to: 'Jan 1 2024',
35
+ fingerprint: 'SHA256:...'
36
+ });
37
+ (mockSocket as any).getProtocol = () => 'TLSv1.3';
38
+ (mockSocket as any).getCipher = () => ({ name: 'TLS_AES_...' });
39
+ (mockSocket as any).alpnProtocol = 'h2';
40
+ (mockSocket as any).authorized = true;
41
+
42
+ mockRes.socket = mockSocket;
43
+
44
+ // Fake request: an EventEmitter with stubbed end() and destroy()
45
+ const mockReq = new EventEmitter() as any;
46
+ mockReq.end = vi.fn();
47
+ mockReq.destroy = vi.fn();
48
+
49
+ // Replace https.request so no real network connection is opened
50
+ vi.spyOn(https, 'request').mockImplementation((url, options, cb) => {
51
+ if (cb) cb(mockRes); // the response callback fires synchronously, before any socket event below
52
+ // Emit the socket lifecycle events 10 ms later, in connection order
53
+ setTimeout(() => {
54
+ mockReq.emit('socket', mockRes.socket);
55
+ mockRes.socket.emit('lookup');
56
+ mockRes.socket.emit('connect');
57
+ mockRes.socket.emit('secureConnect');
58
+ mockReq.emit('finish');
59
+ // Response body (13 bytes) followed by end-of-stream
60
+ mockRes.emit('data', Buffer.from('<html></html>'));
61
+ mockRes.emit('end');
62
+ }, 10);
63
+ return mockReq;
64
+ });
65
+
66
+ const result = await analyzeTransport('https://example.com', 1000); // NOTE(review): second argument is presumably a timeout in ms - confirm
67
+ expect(result.transport.tlsVersion).toBe('TLSv1.3'); // value returned by the fake getProtocol()
68
+ expect(result.transport.httpVersion).toBe('1.1'); // matches mockRes.httpVersion, not the socket's alpnProtocol of h2
69
+ expect(result.performance.htmlSize).toBeGreaterThan(0); // only requires that the emitted body was counted
70
+ expect(result.transport.headers['server']).toBe('nginx');
71
+ });
72
+
73
+ it('should handle redirects', async () => { // Two faked https.request calls: a 301 response, then a 200 response.
74
+ const req1 = new EventEmitter() as any; req1.end = vi.fn(); req1.destroy = vi.fn(); // first request (stubbed end/destroy)
75
+ const res1 = new EventEmitter() as any; res1.statusCode = 301; res1.headers = { location: 'https://example.com/' }; // redirect response carrying the Location header
76
+ res1.socket = new EventEmitter(); Object.setPrototypeOf(res1.socket, tls.TLSSocket.prototype); // fake socket that passes an instanceof tls.TLSSocket check
77
+
78
+ const req2 = new EventEmitter() as any; req2.end = vi.fn(); req2.destroy = vi.fn(); // second request, issued after the redirect
79
+ const res2 = new EventEmitter() as any; res2.statusCode = 200; res2.headers = {}; // final response
80
+ res2.socket = new EventEmitter(); Object.setPrototypeOf(res2.socket, tls.TLSSocket.prototype);
81
+
82
+ // Only the final (res2) socket gets TLS accessors; res1.socket has none defined here
83
+ res2.socket.getPeerCertificate = () => ({});
84
+ res2.socket.getProtocol = () => 'TLSv1.2';
85
+ res2.socket.getCipher = () => ({ name: 'AES' });
86
+
87
+ const requestSpy = vi.spyOn(https, 'request');
88
+ requestSpy
89
+ .mockImplementationOnce((url, options, cb) => { // first call: serve the 301
90
+ if (cb) cb(res1);
91
+ setTimeout(() => {
92
+ req1.emit('socket', res1.socket);
93
+ res1.emit('data', Buffer.from('redirecting'));
94
+ res1.emit('end');
95
+ }, 10);
96
+ return req1;
97
+ })
98
+ .mockImplementationOnce((url, options, cb) => { // second call: serve the 200
99
+ if (cb) cb(res2);
100
+ setTimeout(() => {
101
+ req2.emit('socket', res2.socket);
102
+ res2.emit('data', Buffer.from('ok'));
103
+ res2.emit('end');
104
+ }, 10);
105
+ return req2;
106
+ });
107
+
108
+ const result = await analyzeTransport('https://redirect.com', 1000);
109
+ expect(result.transport.redirectCount).toBe(1); // exactly one hop: the 301 served by the first fake call
110
+ expect(result.transport.redirects[0].location).toBe('https://example.com/'); // Location header of res1
111
+ });
112
+ });
@@ -0,0 +1,118 @@
1
+ import { describe, it, expect, beforeEach } from 'vitest';
2
+ import { Graph } from '../src/graph/graph.js';
3
+ import { detectContentClusters } from '../src/graph/cluster.js';
4
+
5
+ describe('Content Clustering', () => {
6
+ let graph: Graph; // shared by every test below; reassigned in beforeEach
7
+
8
+ beforeEach(() => { // Gives each test a fresh, empty Graph so nodes never leak between tests.
9
+ graph = new Graph();
10
+ });
11
+
12
+ it('should group similar pages into a cluster', () => {
13
+ // Near-identical simhashes: h2 is 1 bit away from both h1 and h3, while h1 and h3 differ by 2 bits
14
+ const h1 = 0b101010n;
15
+ const h2 = 0b101011n;
16
+ const h3 = 0b101001n;
17
+
18
+ graph.addNode('https://example.com/p1', 0, 200);
19
+ graph.addNode('https://example.com/p2', 0, 200);
20
+ graph.addNode('https://example.com/p3', 0, 200);
21
+
22
+ graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() }); // simhash is stored as a decimal string, not a BigInt
23
+ graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
24
+ graph.updateNodeData('https://example.com/p3', { simhash: h3.toString() });
25
+
26
+ const clusters = detectContentClusters(graph, 2, 2); // NOTE(review): arguments are presumably (graph, max Hamming distance, min cluster size) - confirm in cluster.ts
27
+
28
+ expect(clusters.length).toBe(1);
29
+ expect(clusters[0].count).toBe(3);
30
+ expect(graph.nodes.get('https://example.com/p1')?.clusterId).toBe(1); // clustering also writes clusterId back onto the graph node
31
+ });
32
+
33
+ it('should separate dissimilar pages', () => {
34
+ // Simhashes that differ in all 10 low bits (Hamming distance 10)
35
+ const h1 = 0b1111111111n;
36
+ const h2 = 0b0000000000n;
37
+
38
+ graph.addNode('https://example.com/p1', 0, 200);
39
+ graph.addNode('https://example.com/p2', 0, 200);
40
+
41
+ graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
42
+ graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
43
+
44
+ const clusters = detectContentClusters(graph, 2, 2);
45
+
46
+ expect(clusters.length).toBe(0); // the two pages are not grouped, so no group reaches the minimum size of 2
47
+ });
48
+
49
+ it('should respect minClusterSize', () => { // Two pages whose hashes differ by a single bit, clustered with a minimum size of 3.
50
+ const h1 = 0b1n;
51
+ const h2 = 0b0n;
52
+
53
+ graph.addNode('https://example.com/p1', 0, 200);
54
+ graph.addNode('https://example.com/p2', 0, 200);
55
+
56
+ graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
57
+ graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
58
+
59
+ const clusters = detectContentClusters(graph, 1, 3); // distance 1 is within the threshold of 1, but only 2 pages exist
60
+ expect(clusters.length).toBe(0); // a 2-page group is below the requested size of 3
61
+ });
62
+
63
+ it('should identify shared path prefixes (silos)', () => { // Three pages under /blog with identical simhashes.
64
+ graph.addNode('https://example.com/blog/seo-tips', 0, 200);
65
+ graph.addNode('https://example.com/blog/link-building', 0, 200);
66
+ graph.addNode('https://example.com/blog/technical-seo', 0, 200);
67
+
68
+ const h = 0b111n;
69
+ graph.updateNodeData('https://example.com/blog/seo-tips', { simhash: h.toString() });
70
+ graph.updateNodeData('https://example.com/blog/link-building', { simhash: h.toString() });
71
+ graph.updateNodeData('https://example.com/blog/technical-seo', { simhash: h.toString() });
72
+
73
+ const clusters = detectContentClusters(graph, 0, 3); // threshold 0: only identical hashes may be grouped
74
+ expect(clusters[0].sharedPathPrefix).toBe('/blog'); // path prefix common to all three URLs, without a trailing slash
75
+ });
76
+
77
+ it('should be deterministic with unstable input order', () => {
78
+ // Nodes are inserted in non-sorted order (z, a, m). NOTE(review): only this single order is exercised; no second insertion order is compared.
79
+ const h = 0b111n;
80
+ graph.addNode('https://example.com/z', 0, 200);
81
+ graph.addNode('https://example.com/a', 0, 200);
82
+ graph.addNode('https://example.com/m', 0, 200);
83
+
84
+ graph.updateNodeData('https://example.com/z', { simhash: h.toString(), pageRank: 10 });
85
+ graph.updateNodeData('https://example.com/a', { simhash: h.toString(), pageRank: 10 });
86
+ graph.updateNodeData('https://example.com/m', { simhash: h.toString(), pageRank: 10 });
87
+
88
+ const clusters = detectContentClusters(graph, 0, 3);
89
+ // PageRanks are equal and all three URLs have the same length, so the lexicographically first URL (a) is expected as primary
90
+ expect(clusters[0].primaryUrl).toBe('https://example.com/a');
91
+ });
92
+
93
+ it('should use band optimization correctly (heuristic nature)', () => {
94
+ // Build 10 nodes in 2 groups of 5 (NOTE(review): assumes 16-bit bands - confirm in cluster.ts)
95
+ // Group 1: identical in bits 0-15 (band 0)
96
+ // Group 2: identical in bits 16-31 (band 1)
97
+ for (let i = 0; i < 5; i++) {
98
+ const url = `https://example.com/g1/${i}`;
99
+ graph.addNode(url, 0, 200);
100
+ // Bits 0-15 are fixed at 0xAAAA; i is placed from bit 16 upwards, so members differ by at most 3 bits
101
+ const hash = BigInt(0xAAAA) | (BigInt(i) << 16n);
102
+ graph.updateNodeData(url, { simhash: hash.toString() });
103
+ }
104
+
105
+ for (let i = 0; i < 5; i++) {
106
+ const url = `https://example.com/g2/${i}`;
107
+ graph.addNode(url, 0, 200);
108
+ // Bits 16-31 are fixed at 0xBBBB; i occupies the lowest bits, so members differ by at most 3 bits
109
+ const hash = (BigInt(0xBBBB) << 16n) | BigInt(i);
110
+ graph.updateNodeData(url, { simhash: hash.toString() });
111
+ }
112
+
113
+ const clusters = detectContentClusters(graph, 5, 3); // threshold 5 covers the 3-bit spread inside each group
114
+ expect(clusters.length).toBe(2); // the two groups must not be merged
115
+ expect(clusters[0].count).toBe(5);
116
+ expect(clusters[1].count).toBe(5);
117
+ });
118
+ });