@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,133 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { calculateScore } from '../../src/audit/scoring.js';
3
- import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics } from '../../src/audit/types.js';
4
-
5
- describe('Scoring Engine', () => {
6
- const mockTransport: TransportDiagnostics = {
7
- tlsVersion: 'TLSv1.3',
8
- cipherSuite: 'TLS_AES_256_GCM_SHA384',
9
- alpnProtocol: 'h2',
10
- certificate: {
11
- issuer: 'Let\'s Encrypt',
12
- subject: 'example.com',
13
- validFrom: '2023-01-01',
14
- validTo: '2024-01-01',
15
- daysUntilExpiry: 60,
16
- isSelfSigned: false,
17
- isValidChain: true,
18
- fingerprint: 'SHA256:...'
19
- } as any,
20
- httpVersion: '2.0',
21
- compression: ['gzip'],
22
- keepAlive: true,
23
- transferEncoding: null,
24
- redirectCount: 0,
25
- redirects: [],
26
- serverHeader: 'nginx',
27
- headers: {}
28
- };
29
-
30
- const mockDns: DnsDiagnostics = {
31
- a: ['1.1.1.1', '1.0.0.1'],
32
- aaaa: ['2606:4700:4700::1111'],
33
- cname: [],
34
- reverse: [],
35
- ipCount: 3,
36
- ipv6Support: true,
37
- resolutionTime: 10
38
- };
39
-
40
- const mockHeaders: SecurityHeadersResult = {
41
- strictTransportSecurity: { present: true, valid: true, value: 'max-age=31536000' },
42
- contentSecurityPolicy: { present: true, valid: true, value: "default-src 'self'" },
43
- xFrameOptions: { present: true, valid: true, value: 'DENY' },
44
- xContentTypeOptions: { present: true, valid: true, value: 'nosniff' },
45
- referrerPolicy: { present: true, valid: true, value: 'strict-origin' },
46
- permissionsPolicy: { present: true, valid: true, value: 'geolocation=()' },
47
- details: {},
48
- score: 100
49
- };
50
-
51
- const mockPerformance: PerformanceMetrics = {
52
- dnsLookupTime: 10,
53
- tcpConnectTime: 20,
54
- tlsHandshakeTime: 30,
55
- ttfb: 100,
56
- totalTime: 200,
57
- htmlSize: 50000,
58
- headerSize: 500,
59
- redirectTime: 0
60
- };
61
-
62
- it('should give perfect score for perfect inputs', () => {
63
- const result = calculateScore(mockTransport, mockDns, mockHeaders, mockPerformance, []);
64
- expect(result.score).toBe(100);
65
- expect(result.grade).toBe('A');
66
- expect(result.issues).toHaveLength(0);
67
- });
68
-
69
- it('should penalize TLS < 1.2', () => {
70
- const badTransport = { ...mockTransport, tlsVersion: 'TLSv1.1' };
71
- const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
72
- expect(result.score).toBeLessThan(100);
73
- expect(result.categoryScores.transport).toBeLessThan(30);
74
- expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'tls-old' })]));
75
- });
76
-
77
- it('should penalize missing HTTPS', () => {
78
- const badTransport = { ...mockTransport, tlsVersion: null, certificate: null };
79
- const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
80
- expect(result.score).toBeLessThan(50); // Critical
81
- expect(result.grade).toBe('F');
82
- expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'no-https' })]));
83
- });
84
-
85
- it('should fail on expired cert', () => {
86
- const expiredTransport = {
87
- ...mockTransport,
88
- certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' }
89
- };
90
- const result = calculateScore(expiredTransport, mockDns, mockHeaders, mockPerformance, []);
91
- expect(result.grade).toBe('F');
92
- expect(result.score).toBeLessThanOrEqual(40);
93
- expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'cert-expired' })]));
94
- });
95
-
96
- it('should penalize missing security headers', () => {
97
- // If score is 50, it means we lost 50 points in headers category (internal score)
98
- // headers category is 20 points total. So we lose 10 points.
99
- const badHeaders = { ...mockHeaders, score: 50, strictTransportSecurity: { present: false, valid: false, value: null } };
100
- const result = calculateScore(mockTransport, mockDns, badHeaders, mockPerformance, []);
101
- expect(result.categoryScores.security).toBe(10);
102
- expect(result.score).toBe(90); // 100 - 10
103
- expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'hsts-missing' })]));
104
- });
105
-
106
- it('should penalize poor performance', () => {
107
- const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
108
- const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
109
- // TTFB > 800: Lose 10 pts
110
- // HTML > 1MB: Lose 5 pts
111
- // Total perf score (30) -> 15.
112
- expect(result.categoryScores.performance).toBe(15);
113
- expect(result.score).toBe(85);
114
- expect(result.issues).toEqual(expect.arrayContaining([
115
- expect.objectContaining({ id: 'slow-ttfb' }),
116
- expect.objectContaining({ id: 'large-html' })
117
- ]));
118
- });
119
-
120
- it('should penalize infrastructure issues', () => {
121
- const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
122
- const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
123
- // No IPv6: Lose 10 pts
124
- // Single IP: Lose 10 pts
125
- // Infra score (20) -> 0.
126
- expect(result.categoryScores.infrastructure).toBe(0);
127
- expect(result.score).toBe(80);
128
- expect(result.issues).toEqual(expect.arrayContaining([
129
- expect.objectContaining({ id: 'no-ipv6' }),
130
- expect.objectContaining({ id: 'single-ip' })
131
- ]));
132
- });
133
- });
@@ -1,12 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { auditUrl } from '../../src/audit/index.js';
3
-
4
- describe('Audit Security', () => {
5
- it('should block audits of internal IP addresses', async () => {
6
- await expect(auditUrl('http://127.0.0.1')).rejects.toThrow('Access to internal or private infrastructure is prohibited');
7
- });
8
-
9
- it('should block audits of link-local addresses', async () => {
10
- await expect(auditUrl('http://169.254.169.254')).rejects.toThrow('Access to internal or private infrastructure is prohibited');
11
- });
12
- });
@@ -1,111 +0,0 @@
1
- import { describe, it, expect, vi, afterEach } from 'vitest';
2
- import { analyzeTransport } from '../../src/audit/transport.js';
3
- import https from 'node:https';
4
- import tls from 'node:tls';
5
- import { EventEmitter } from 'events';
6
-
7
- vi.mock('node:https');
8
- vi.mock('node:http');
9
-
10
- describe('Transport Diagnostics', () => {
11
- afterEach(() => {
12
- vi.clearAllMocks();
13
- });
14
-
15
- it('should analyze HTTPS transport', async () => {
16
- // Mock Response
17
- const mockRes = new EventEmitter() as any;
18
- mockRes.statusCode = 200;
19
- mockRes.statusMessage = 'OK';
20
- mockRes.headers = {
21
- 'content-encoding': 'gzip',
22
- 'server': 'nginx',
23
- 'connection': 'keep-alive'
24
- };
25
- mockRes.httpVersion = '1.1';
26
-
27
- const mockSocket = new EventEmitter();
28
- Object.setPrototypeOf(mockSocket, tls.TLSSocket.prototype);
29
- (mockSocket as any).getPeerCertificate = () => ({
30
- subject: { CN: 'example.com' },
31
- issuer: { CN: 'Let\'s Encrypt' },
32
- valid_from: 'Jan 1 2023',
33
- valid_to: 'Jan 1 2024',
34
- fingerprint: 'SHA256:...'
35
- });
36
- (mockSocket as any).getProtocol = () => 'TLSv1.3';
37
- (mockSocket as any).getCipher = () => ({ name: 'TLS_AES_...' });
38
- (mockSocket as any).alpnProtocol = 'h2';
39
- (mockSocket as any).authorized = true;
40
-
41
- mockRes.socket = mockSocket;
42
-
43
- // Mock Request
44
- const mockReq = new EventEmitter() as any;
45
- mockReq.end = vi.fn();
46
- mockReq.destroy = vi.fn();
47
-
48
- // Mock https.request
49
- vi.spyOn(https, 'request').mockImplementation((url, options, cb) => {
50
- if (cb) cb(mockRes);
51
- // Simulate socket events
52
- setTimeout(() => {
53
- mockReq.emit('socket', mockRes.socket);
54
- mockRes.socket.emit('lookup');
55
- mockRes.socket.emit('connect');
56
- mockRes.socket.emit('secureConnect');
57
- mockReq.emit('finish');
58
- // Response data
59
- mockRes.emit('data', Buffer.from('<html></html>'));
60
- mockRes.emit('end');
61
- }, 10);
62
- return mockReq;
63
- });
64
-
65
- const result = await analyzeTransport('https://example.com', 1000);
66
- expect(result.transport.tlsVersion).toBe('TLSv1.3');
67
- expect(result.transport.httpVersion).toBe('1.1');
68
- expect(result.performance.htmlSize).toBeGreaterThan(0);
69
- expect(result.transport.headers['server']).toBe('nginx');
70
- });
71
-
72
- it('should handle redirects', async () => {
73
- const req1 = new EventEmitter() as any; req1.end = vi.fn(); req1.destroy = vi.fn();
74
- const res1 = new EventEmitter() as any; res1.statusCode = 301; res1.headers = { location: 'https://example.com/' };
75
- res1.socket = new EventEmitter(); Object.setPrototypeOf(res1.socket, tls.TLSSocket.prototype);
76
-
77
- const req2 = new EventEmitter() as any; req2.end = vi.fn(); req2.destroy = vi.fn();
78
- const res2 = new EventEmitter() as any; res2.statusCode = 200; res2.headers = {};
79
- res2.socket = new EventEmitter(); Object.setPrototypeOf(res2.socket, tls.TLSSocket.prototype);
80
-
81
- // Setup res2 socket for TLS checks
82
- res2.socket.getPeerCertificate = () => ({});
83
- res2.socket.getProtocol = () => 'TLSv1.2';
84
- res2.socket.getCipher = () => ({ name: 'AES' });
85
-
86
- const requestSpy = vi.spyOn(https, 'request');
87
- requestSpy
88
- .mockImplementationOnce((url, options, cb) => {
89
- if (cb) cb(res1);
90
- setTimeout(() => {
91
- req1.emit('socket', res1.socket);
92
- res1.emit('data', Buffer.from('redirecting'));
93
- res1.emit('end');
94
- }, 10);
95
- return req1;
96
- })
97
- .mockImplementationOnce((url, options, cb) => {
98
- if (cb) cb(res2);
99
- setTimeout(() => {
100
- req2.emit('socket', res2.socket);
101
- res2.emit('data', Buffer.from('ok'));
102
- res2.emit('end');
103
- }, 10);
104
- return req2;
105
- });
106
-
107
- const result = await analyzeTransport('https://redirect.com', 1000);
108
- expect(result.transport.redirectCount).toBe(1);
109
- expect(result.transport.redirects[0].location).toBe('https://example.com/');
110
- });
111
- });
@@ -1,118 +0,0 @@
1
- import { describe, it, expect, beforeEach } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { detectContentClusters } from '../src/graph/cluster.js';
4
-
5
- describe('Content Clustering', () => {
6
- let graph: Graph;
7
-
8
- beforeEach(() => {
9
- graph = new Graph();
10
- });
11
-
12
- it('should group similar pages into a cluster', () => {
13
- // Mock simhashes for similar pages (Hamming distance 1)
14
- const h1 = 0b101010n;
15
- const h2 = 0b101011n;
16
- const h3 = 0b101001n;
17
-
18
- graph.addNode('https://example.com/p1', 0, 200);
19
- graph.addNode('https://example.com/p2', 0, 200);
20
- graph.addNode('https://example.com/p3', 0, 200);
21
-
22
- graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
23
- graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
24
- graph.updateNodeData('https://example.com/p3', { simhash: h3.toString() });
25
-
26
- const clusters = detectContentClusters(graph, 2, 2);
27
-
28
- expect(clusters.length).toBe(1);
29
- expect(clusters[0].count).toBe(3);
30
- expect(graph.nodes.get('https://example.com/p1')?.clusterId).toBe(1);
31
- });
32
-
33
- it('should separate dissimilar pages', () => {
34
- // Mock simhashes for very different pages
35
- const h1 = 0b1111111111n;
36
- const h2 = 0b0000000000n;
37
-
38
- graph.addNode('https://example.com/p1', 0, 200);
39
- graph.addNode('https://example.com/p2', 0, 200);
40
-
41
- graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
42
- graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
43
-
44
- const clusters = detectContentClusters(graph, 2, 2);
45
-
46
- expect(clusters.length).toBe(0); // None meet minSize 2
47
- });
48
-
49
- it('should respect minClusterSize', () => {
50
- const h1 = 0b1n;
51
- const h2 = 0b0n;
52
-
53
- graph.addNode('https://example.com/p1', 0, 200);
54
- graph.addNode('https://example.com/p2', 0, 200);
55
-
56
- graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
57
- graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
58
-
59
- const clusters = detectContentClusters(graph, 1, 3);
60
- expect(clusters.length).toBe(0);
61
- });
62
-
63
- it('should identify shared path prefixes (silos)', () => {
64
- graph.addNode('https://example.com/blog/seo-tips', 0, 200);
65
- graph.addNode('https://example.com/blog/link-building', 0, 200);
66
- graph.addNode('https://example.com/blog/technical-seo', 0, 200);
67
-
68
- const h = 0b111n;
69
- graph.updateNodeData('https://example.com/blog/seo-tips', { simhash: h.toString() });
70
- graph.updateNodeData('https://example.com/blog/link-building', { simhash: h.toString() });
71
- graph.updateNodeData('https://example.com/blog/technical-seo', { simhash: h.toString() });
72
-
73
- const clusters = detectContentClusters(graph, 0, 3);
74
- expect(clusters[0].sharedPathPrefix).toBe('/blog');
75
- });
76
-
77
- it('should be deterministic with unstable input order', () => {
78
- // We'll add nodes in different orders and check if cluster primary is same
79
- const h = 0b111n;
80
- graph.addNode('https://example.com/z', 0, 200);
81
- graph.addNode('https://example.com/a', 0, 200);
82
- graph.addNode('https://example.com/m', 0, 200);
83
-
84
- graph.updateNodeData('https://example.com/z', { simhash: h.toString(), pageRank: 10 });
85
- graph.updateNodeData('https://example.com/a', { simhash: h.toString(), pageRank: 10 });
86
- graph.updateNodeData('https://example.com/m', { simhash: h.toString(), pageRank: 10 });
87
-
88
- const clusters = detectContentClusters(graph, 0, 3);
89
- // a should be primary because it's shortest/lexicographic first since PageRanks are same
90
- expect(clusters[0].primaryUrl).toBe('https://example.com/a');
91
- });
92
-
93
- it('should use band optimization correctly (heuristic nature)', () => {
94
- // Create many nodes in 2 groups
95
- // Group 1: Matches in band 0
96
- // Group 2: Matches in band 1
97
- for (let i = 0; i < 5; i++) {
98
- const url = `https://example.com/g1/${i}`;
99
- graph.addNode(url, 0, 200);
100
- // Simhash that matches in first 16 bits (0xAAAA)
101
- const hash = BigInt(0xAAAA) | (BigInt(i) << 16n);
102
- graph.updateNodeData(url, { simhash: hash.toString() });
103
- }
104
-
105
- for (let i = 0; i < 5; i++) {
106
- const url = `https://example.com/g2/${i}`;
107
- graph.addNode(url, 0, 200);
108
- // Simhash that matches in second 16 bits (0xBBBB << 16)
109
- const hash = (BigInt(0xBBBB) << 16n) | BigInt(i);
110
- graph.updateNodeData(url, { simhash: hash.toString() });
111
- }
112
-
113
- const clusters = detectContentClusters(graph, 5, 3);
114
- expect(clusters.length).toBe(2);
115
- expect(clusters[0].count).toBe(5);
116
- expect(clusters[1].count).toBe(5);
117
- });
118
- });
@@ -1,118 +0,0 @@
1
- import { describe, it, expect, beforeEach } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { detectContentClusters } from '../src/graph/cluster.js';
4
-
5
- describe('Cluster Risk Heuristic', () => {
6
- let graph: Graph;
7
-
8
- beforeEach(() => {
9
- graph = new Graph();
10
- });
11
-
12
- it('should assign HIGH risk to clusters with identical titles', () => {
13
- const html = '<html><head><title>Duplicate Title</title></head><body>Content</body></html>';
14
- const h = 0b101010n.toString();
15
-
16
- graph.addNode('https://example.com/p1', 0, 200);
17
- graph.addNode('https://example.com/p2', 0, 200);
18
- graph.addNode('https://example.com/p3', 0, 200);
19
-
20
- graph.updateNodeData('https://example.com/p1', { simhash: h, html });
21
- graph.updateNodeData('https://example.com/p2', { simhash: h, html });
22
- graph.updateNodeData('https://example.com/p3', { simhash: h, html });
23
-
24
- const clusters = detectContentClusters(graph, 2, 2);
25
-
26
- expect(clusters.length).toBe(1);
27
- expect(clusters[0].risk).toBe('high');
28
- });
29
-
30
- it('should assign HIGH risk to clusters with identical H1s', () => {
31
- const h = 0b101010n.toString();
32
-
33
- graph.addNode('https://example.com/p1', 0, 200);
34
- graph.addNode('https://example.com/p2', 0, 200);
35
- graph.addNode('https://example.com/p3', 0, 200);
36
-
37
- // Different titles, same H1
38
- graph.updateNodeData('https://example.com/p1', {
39
- simhash: h,
40
- html: '<html><head><title>Page 1</title></head><body><h1>Duplicate Header</h1></body></html>'
41
- });
42
- graph.updateNodeData('https://example.com/p2', {
43
- simhash: h,
44
- html: '<html><head><title>Page 2</title></head><body><h1>Duplicate Header</h1></body></html>'
45
- });
46
- graph.updateNodeData('https://example.com/p3', {
47
- simhash: h,
48
- html: '<html><head><title>Page 3</title></head><body><h1>Duplicate Header</h1></body></html>'
49
- });
50
-
51
- const clusters = detectContentClusters(graph, 2, 2);
52
-
53
- expect(clusters.length).toBe(1);
54
- expect(clusters[0].risk).toBe('high');
55
- });
56
-
57
- it('should assign LOW risk to small clusters with unique titles and H1s', () => {
58
- const h = 0b101010n.toString();
59
-
60
- graph.addNode('https://example.com/p1', 0, 200);
61
- graph.addNode('https://example.com/p2', 0, 200);
62
- graph.addNode('https://example.com/p3', 0, 200);
63
-
64
- graph.updateNodeData('https://example.com/p1', {
65
- simhash: h,
66
- html: '<html><head><title>Page 1</title></head><body><h1>Header 1</h1></body></html>'
67
- });
68
- graph.updateNodeData('https://example.com/p2', {
69
- simhash: h,
70
- html: '<html><head><title>Page 2</title></head><body><h1>Header 2</h1></body></html>'
71
- });
72
- graph.updateNodeData('https://example.com/p3', {
73
- simhash: h,
74
- html: '<html><head><title>Page 3</title></head><body><h1>Header 3</h1></body></html>'
75
- });
76
-
77
- const clusters = detectContentClusters(graph, 2, 2);
78
-
79
- expect(clusters.length).toBe(1);
80
- expect(clusters[0].risk).toBe('low');
81
- });
82
-
83
- it('should assign MEDIUM risk to large clusters even with unique titles', () => {
84
- const h = 0b101010n.toString();
85
-
86
- // 12 nodes, all unique titles
87
- for (let i = 0; i < 12; i++) {
88
- const url = `https://example.com/p${i}`;
89
- graph.addNode(url, 0, 200);
90
- graph.updateNodeData(url, {
91
- simhash: h,
92
- html: `<html><head><title>Page ${i}</title></head><body><h1>Header ${i}</h1></body></html>`
93
- });
94
- }
95
-
96
- const clusters = detectContentClusters(graph, 2, 2);
97
-
98
- expect(clusters.length).toBe(1);
99
- expect(clusters[0].risk).toBe('medium');
100
- });
101
-
102
- it('should handle missing HTML gracefully', () => {
103
- const h = 0b101010n.toString();
104
-
105
- graph.addNode('https://example.com/p1', 0, 200);
106
- graph.addNode('https://example.com/p2', 0, 200);
107
-
108
- // No HTML provided
109
- graph.updateNodeData('https://example.com/p1', { simhash: h });
110
- graph.updateNodeData('https://example.com/p2', { simhash: h });
111
-
112
- const clusters = detectContentClusters(graph, 2, 2);
113
-
114
- expect(clusters.length).toBe(1);
115
- // Fallback to size based? 2 nodes -> low risk
116
- expect(clusters[0].risk).toBe('low');
117
- });
118
- });