@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,110 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { Graph } from '../src/graph/graph.js';
3
+ import { detectDuplicates } from '../src/graph/duplicate.js';
4
+ import { SimHash } from '../src/graph/simhash.js';
5
+
6
/**
 * Behavioural tests for `detectDuplicates`, driven through small hand-built graphs.
 * Each test creates its own `Graph`, annotates nodes via `updateNodeData`, runs the
 * detector and inspects `graph.duplicateClusters` and the per-node flags it sets.
 */
describe('Duplicate Detection', () => {
  it('should detect exact duplicates based on contentHash', () => {
    const graph = new Graph();
    graph.addNode('https://example.com/a', 0, 200);
    graph.addNode('https://example.com/b', 0, 200);
    graph.addNode('https://example.com/c', 0, 200);

    // a and b share a content hash; c carries a different one.
    graph.updateNodeData('https://example.com/a', { contentHash: 'hash1', uniqueTokenRatio: 1.0 });
    graph.updateNodeData('https://example.com/b', { contentHash: 'hash1', uniqueTokenRatio: 1.0 });
    graph.updateNodeData('https://example.com/c', { contentHash: 'hash2', uniqueTokenRatio: 1.0 });

    detectDuplicates(graph);

    expect(graph.duplicateClusters).toHaveLength(1);
    const cluster = graph.duplicateClusters[0];
    expect(cluster.type).toBe('exact');
    expect(cluster.size).toBe(2);

    const nodeA = graph.nodes.get('https://example.com/a')!;
    const nodeB = graph.nodes.get('https://example.com/b')!;
    expect(nodeA.duplicateClusterId).toBeDefined();
    expect(nodeA.duplicateClusterId).toBe(nodeB.duplicateClusterId);

    // Exactly one of the pair is collapsed: boolean XOR over the coerced flags.
    expect(!nodeA.isCollapsed !== !nodeB.isCollapsed).toBe(true);
  });

  it('should detect near duplicates using SimHash', () => {
    const graph = new Graph();
    graph.addNode('https://example.com/x', 0, 200);
    graph.addNode('https://example.com/y', 0, 200);

    // The two token lists differ only in their last token. The fingerprints are
    // computed rather than hard-coded, so their exact bit distance is not pinned here.
    const tokens1 = ['hello', 'world', 'this', 'is', 'a', 'test', 'document'];
    const tokens2 = ['hello', 'world', 'this', 'is', 'a', 'test', 'document2'];

    const h1 = SimHash.generate(tokens1);
    const h2 = SimHash.generate(tokens2);

    // Distinct content hashes keep the pair out of the exact-duplicate path.
    graph.updateNodeData('https://example.com/x', { contentHash: 'x', simhash: h1.toString(), uniqueTokenRatio: 1.0 });
    graph.updateNodeData('https://example.com/y', { contentHash: 'y', simhash: h2.toString(), uniqueTokenRatio: 1.0 });

    // Relaxed threshold so the two computed fingerprints are expected to match.
    detectDuplicates(graph, { simhashThreshold: 10 });

    expect(graph.duplicateClusters).toHaveLength(1);
    expect(graph.duplicateClusters[0].type).toBe('near');
  });

  it('should identify template-heavy clusters', () => {
    const graph = new Graph();
    graph.addNode('https://example.com/1', 0, 200);
    graph.addNode('https://example.com/2', 0, 200);

    // Same content hash combined with a low unique-token ratio on both pages.
    graph.updateNodeData('https://example.com/1', { contentHash: 'h1', uniqueTokenRatio: 0.2 });
    graph.updateNodeData('https://example.com/2', { contentHash: 'h1', uniqueTokenRatio: 0.2 });

    detectDuplicates(graph);

    // Guard first so a missing cluster fails as an assertion, not as a TypeError.
    const cluster = graph.duplicateClusters[0];
    expect(cluster).toBeDefined();
    expect(cluster.type).toBe('template_heavy');
  });

  it('should mark high severity on missing canonicals', () => {
    const graph = new Graph();
    graph.addNode('https://example.com/a', 0, 200);
    graph.addNode('https://example.com/b', 0, 200);

    graph.updateNodeData('https://example.com/a', { contentHash: 'h1', canonical: 'https://example.com/a' });
    graph.updateNodeData('https://example.com/b', { contentHash: 'h1', canonical: undefined }); // missing

    detectDuplicates(graph);

    // Guard first so a missing cluster fails as an assertion, not as a TypeError.
    const cluster = graph.duplicateClusters[0];
    expect(cluster).toBeDefined();
    expect(cluster.severity).toBe('high');
  });

  it('should transfer edges during collapse', () => {
    const graph = new Graph();
    graph.addNode('https://example.com/a', 0, 200);
    graph.addNode('https://example.com/b', 0, 200);
    graph.addNode('https://example.com/source', 0, 200);

    graph.updateNodeData('https://example.com/a', { contentHash: 'h1' });
    graph.updateNodeData('https://example.com/b', { contentHash: 'h1' });

    // Inbound edge that targets B, written straight into the edge map.
    graph.edges.set('https://example.com/source|https://example.com/b', 1);

    // Give A a higher inLinks count so that it is expected to be chosen as the
    // cluster representative (the choice itself is made by detectDuplicates).
    graph.nodes.get('https://example.com/a')!.inLinks = 10;

    detectDuplicates(graph);

    const a = graph.nodes.get('https://example.com/a')!;
    const b = graph.nodes.get('https://example.com/b')!;

    expect(a.isClusterPrimary).toBe(true);
    expect(a.isCollapsed).toBe(false);
    expect(b.isCollapsed).toBe(true);
    expect(b.collapseInto).toBe('https://example.com/a');

    // The edge that pointed at B must now exist against A.
    expect(graph.edges.has('https://example.com/source|https://example.com/a')).toBe(true);
  });
});
@@ -0,0 +1,106 @@
1
+ import { test, expect, beforeEach } from 'vitest';
2
+ import { Fetcher } from '../src/crawler/fetcher.js';
3
+ import { MockAgent, setGlobalDispatcher } from 'undici';
4
+
5
// Mock dispatcher shared by every test in this file; recreated per test.
let mockAgent: MockAgent;

/**
 * Installs a fresh undici MockAgent as the global dispatcher before each test
 * and forbids real network connections.
 *
 * The returned function is run by Vitest after the test (same role as
 * `afterEach`) and closes the agent, so agents do not accumulate across tests.
 */
beforeEach(() => {
  mockAgent = new MockAgent();
  mockAgent.disableNetConnect();
  setGlobalDispatcher(mockAgent);

  const installedAgent = mockAgent;
  return async () => {
    await installedAgent.close();
  };
});
12
+
13
// A plain 200 response: body, validators (ETag / Last-Modified) and an empty
// redirect chain must all be surfaced on the result.
test('fetches simple page', async () => {
  const etag = '"123"';
  const lastModified = 'Mon, 01 Jan 2000 00:00:00 GMT';

  mockAgent
    .get('https://example.com')
    .intercept({ path: '/', method: 'GET' })
    .reply(200, 'Hello', {
      headers: { 'content-type': 'text/html', 'etag': etag, 'last-modified': lastModified }
    });

  const response = await new Fetcher().fetch('https://example.com/');

  expect(response.status).toBe(200);
  expect(response.body).toBe('Hello');
  expect(response.etag).toBe(etag);
  expect(response.lastModified).toBe(lastModified);
  expect(response.redirectChain).toEqual([]);
});
27
+
28
// Two-hop chain: a relative redirect on the same origin followed by an
// absolute redirect to another origin, ending in a 200.
test('follows redirects', async () => {
  const origin = mockAgent.get('https://example.com');
  const destination = mockAgent.get('https://other.com');

  // Hop 1: /a -> /b (relative Location header).
  origin.intercept({ path: '/a', method: 'GET' }).reply(301, '', {
    headers: { location: '/b' }
  });
  // Hop 2: /b -> absolute URL on the other origin.
  origin.intercept({ path: '/b', method: 'GET' }).reply(302, '', {
    headers: { location: 'https://other.com/c' }
  });
  // Terminal response.
  destination.intercept({ path: '/c', method: 'GET' }).reply(200, 'Final');

  const outcome = await new Fetcher().fetch('https://example.com/a');

  expect(outcome.status).toBe(200);
  expect(outcome.body).toBe('Final');
  expect(outcome.finalUrl).toBe('https://other.com/c');

  const [firstHop, secondHop] = outcome.redirectChain;
  expect(outcome.redirectChain).toHaveLength(2);
  expect(firstHop).toEqual({ url: 'https://example.com/a', status: 301, target: 'https://example.com/b' });
  expect(secondHop).toEqual({ url: 'https://example.com/b', status: 302, target: 'https://other.com/c' });
});
53
+
54
// /a and /b redirect to each other; the fetch must end with the
// 'redirect_loop' status instead of following the cycle.
test('detects redirect loop', async () => {
  const pool = mockAgent.get('https://loop.com');
  const bounce = (from: string, to: string) =>
    pool.intercept({ path: from, method: 'GET' }).reply(301, '', { headers: { location: to } });

  bounce('/a', '/b');
  bounce('/b', '/a'); // closes the cycle

  const outcome = await new Fetcher().fetch('https://loop.com/a');

  expect(outcome.status).toBe('redirect_loop');
  // Only the first hop is recorded; the loop is reported while resolving B's target.
  expect(outcome.redirectChain).toHaveLength(1);
  expect(outcome.redirectChain[0].url).toBe('https://loop.com/a');
});
69
+
70
// The interceptor only matches when both conditional request headers are
// present, so a 304 result shows the validators were forwarded.
test('sends conditional headers', async () => {
  const etag = '"123"';
  const lastModified = 'Mon, 01 Jan 2000 00:00:00 GMT';

  mockAgent
    .get('https://cache.com')
    .intercept({
      path: '/',
      method: 'GET',
      headers: {
        'If-None-Match': etag,
        'If-Modified-Since': lastModified
      }
    })
    .reply(304, '', { headers: { etag } });

  const revalidation = await new Fetcher().fetch('https://cache.com/', { etag, lastModified });

  expect(revalidation.status).toBe(304);
  expect(revalidation.body).toBe('');
});
91
+
92
// A chain longer than the configured limit must stop with
// 'redirect_limit_exceeded' after recording exactly `maxRedirects` hops.
test('handles max redirects', async () => {
  const pool = mockAgent.get('https://max.com');

  // Register 11 hops: /p0 -> /p1 -> ... -> /p11.
  Array.from({ length: 11 }, (_, hop) => hop).forEach((hop) => {
    pool
      .intercept({ path: `/p${hop}`, method: 'GET' })
      .reply(301, '', { headers: { location: `/p${hop + 1}` } });
  });

  // maxRedirects = 10 makes the 11th hop the failing one. The rate is raised so
  // the sequence of requests does not run into the test timeout.
  const limited = new Fetcher({ maxRedirects: 10, rate: 100 });
  const outcome = await limited.fetch('https://max.com/p0');

  expect(outcome.status).toBe('redirect_limit_exceeded');
  expect(outcome.redirectChain).toHaveLength(10);
});
@@ -0,0 +1,85 @@
1
+ import { describe, it, expect, vi, beforeEach } from 'vitest';
2
+ import { Fetcher } from '../src/crawler/fetcher.js';
3
+ import { request } from 'undici';
4
+
5
// Replace undici with a stub module exposing only a mocked `request`, so each
// test can script the responses the Fetcher receives.
vi.mock('undici', () => {
  return { request: vi.fn() };
});
8
+
9
/**
 * Safety behaviour of the Fetcher with undici's `request` mocked out:
 * internal-IP blocking (direct and via redirect), response size cap, retry on 500.
 */
describe('Fetcher Safety Integration', () => {
  // Typed handle on the mocked undici `request`.
  const requestMock = vi.mocked(request);
  let fetcher: Fetcher;

  beforeEach(() => {
    vi.clearAllMocks();
    // High rate keeps the tests quick.
    fetcher = new Fetcher({ rate: 100 });
  });

  it('should block internal IPs', async () => {
    const outcome = await fetcher.fetch('http://127.0.0.1');
    expect(outcome.status).toBe('blocked_internal_ip');
  });

  it('should block internal IPs in redirects', async () => {
    // The public URL answers with a redirect that targets a private address.
    const redirectToPrivate = {
      statusCode: 301,
      headers: { location: 'http://192.168.1.1' },
      body: { dump: vi.fn(), text: vi.fn().mockResolvedValue('') }
    };
    requestMock.mockResolvedValueOnce(redirectToPrivate as any);

    const outcome = await fetcher.fetch('http://example.com');

    expect(outcome.status).toBe('blocked_internal_ip');
    // The hop that led to the blocked target is still recorded.
    expect(outcome.redirectChain).toHaveLength(1);
    expect(outcome.redirectChain[0].target).toBe('http://192.168.1.1/');
  });

  it('should enforce max bytes', async () => {
    // Body stub that emits two 1000-byte chunks as soon as a 'data' listener attaches.
    const oversizedBody = {
      on: vi.fn((event, cb) => {
        if (event === 'data') {
          cb(Buffer.alloc(1000));
          cb(Buffer.alloc(1000));
        }
        return { on: vi.fn() };
      }),
      destroy: vi.fn(),
      dump: vi.fn()
    };
    requestMock.mockResolvedValueOnce({
      statusCode: 200,
      headers: {},
      body: oversizedBody
    } as any);

    const outcome = await fetcher.fetch('http://example.com', { maxBytes: 500 });
    expect(outcome.status).toBe('oversized');
  });

  it('should retry on 500', async () => {
    const serverError = {
      statusCode: 500,
      headers: {},
      body: { dump: vi.fn().mockResolvedValue(undefined) }
    };
    // Second attempt: emits 'ok' and then ends.
    const success = {
      statusCode: 200,
      headers: {},
      body: {
        on: vi.fn((event, cb) => {
          if (event === 'data') cb(Buffer.from('ok'));
          if (event === 'end') cb();
        })
      }
    };
    requestMock
      .mockResolvedValueOnce(serverError as any)
      .mockResolvedValueOnce(success as any);

    const outcome = await fetcher.fetch('http://example.com');

    expect(outcome.status).toBe(200);
    expect(outcome.retries).toBe(1);
  });
});
@@ -0,0 +1,26 @@
1
+ {
2
+ "pages": [
3
+ {
4
+ "url": "https://example.com/",
5
+ "status": 200,
6
+ "depth": 0,
7
+ "html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='This is an intentionally long and descriptive meta description designed to fit ideal search snippet lengths with rich context for users and engines.'/></head><body><h1>Home</h1><nav><a href='/skip'>Nav</a></nav><p>Welcome to the homepage. This page contains meaningful content. Another sentence here.</p><img src='/a.jpg' alt='hero'><img src='/b.jpg'><a href='/about'>About</a><a href='https://external.com' rel='nofollow noopener'>External</a><script type='application/ld+json'>{\"@context\":\"https://schema.org\",\"@type\":\"WebSite\"}</script></body></html>"
8
+ },
9
+ {
10
+ "url": "https://example.com/about",
11
+ "status": 200,
12
+ "depth": 1,
13
+ "html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='short desc'/></head><body><h1>Example Home Page SEO Title For Strong Ranking Signals 12345</h1><h1>Second</h1><p>Duplicate body sentence. Duplicate body sentence.</p><img src='/c.jpg' alt=''><script type='application/ld+json'>not-json</script><a href='https://example.com/'>Home</a></body></html>"
14
+ },
15
+ {
16
+ "url": "https://example.com/empty",
17
+ "status": 200,
18
+ "depth": 2,
19
+ "html": ""
20
+ }
21
+ ],
22
+ "edges": [
23
+ { "source": "https://example.com/", "target": "https://example.com/about" },
24
+ { "source": "https://example.com/about", "target": "https://example.com/" }
25
+ ]
26
+ }
@@ -0,0 +1,134 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { Graph } from '../src/graph/graph.js';
3
+ import { computeHITS } from '../src/scoring/hits.js';
4
+
5
/**
 * Tests for `computeHITS`: hub/authority scores on small topologies, node
 * eligibility, edge weights, link-role classification and a timing smoke test.
 */
describe('HITS Scoring', () => {
  it('should compute scores for a simple star topology', () => {
    const graph = new Graph();
    // Hub
    graph.addNode('http://hub.com', 0, 200);
    // Authorities
    graph.addNode('http://auth1.com', 1, 200);
    graph.addNode('http://auth2.com', 1, 200);
    graph.addNode('http://auth3.com', 1, 200);

    graph.addEdge('http://hub.com', 'http://auth1.com');
    graph.addEdge('http://hub.com', 'http://auth2.com');
    graph.addEdge('http://hub.com', 'http://auth3.com');

    computeHITS(graph, { iterations: 10 });

    const hub = graph.nodes.get('http://hub.com')!;
    const auth1 = graph.nodes.get('http://auth1.com')!;

    // The single hub carries the hub score and has no authority (nothing links to it).
    expect(hub.hubScore).toBeGreaterThan(0.9);
    expect(hub.authorityScore).toBe(0);

    // A leaf carries authority and has no hub score (it links to nothing).
    expect(auth1.authorityScore).toBeGreaterThan(0.5);
    expect(auth1.hubScore).toBe(0);
  });

  it('should handle exclusion rules', () => {
    const graph = new Graph();
    graph.addNode('http://valid.com', 0, 200);
    graph.addNode('http://noindex.com', 0, 200);
    graph.updateNodeData('http://noindex.com', { noindex: true });
    graph.addNode('http://redirect.com', 0, 200);
    graph.updateNodeData('http://redirect.com', { redirectChain: ['http://target.com'] });
    // Unlinked node with status 200 and no exclusion flag set.
    graph.addNode('http://external.com', 0, 200);

    graph.addEdge('http://valid.com', 'http://noindex.com');
    graph.addEdge('http://valid.com', 'http://redirect.com');

    computeHITS(graph);

    // Excluded nodes receive no score at all.
    expect(graph.nodes.get('http://noindex.com')?.hubScore).toBeUndefined();
    expect(graph.nodes.get('http://redirect.com')?.hubScore).toBeUndefined();
    // valid.com is scored, but both of its targets are excluded, so its hub score is 0.
    expect(graph.nodes.get('http://valid.com')?.hubScore).toBe(0);
  });

  it('should respect edge weights', () => {
    const graph = new Graph();
    graph.addNode('http://hub.com', 0, 200);
    graph.addNode('http://auth-high.com', 1, 200);
    graph.addNode('http://auth-low.com', 1, 200);

    graph.addEdge('http://hub.com', 'http://auth-high.com', 1.0);
    graph.addEdge('http://hub.com', 'http://auth-low.com', 0.1);

    computeHITS(graph, { iterations: 10 });

    const authHigh = graph.nodes.get('http://auth-high.com')!;
    const authLow = graph.nodes.get('http://auth-low.com')!;

    expect(authHigh.authorityScore).toBeGreaterThan(authLow.authorityScore!);
  });

  it('should classify link roles correctly', () => {
    const graph = new Graph();
    for (let i = 0; i < 11; i++) {
      graph.addNode(`http://node${i}.com`, 0, 200);
    }

    // AUTHORITY candidate: node1 is linked by nodes 0, 2, 3 and 4 and has no out-links.
    graph.addEdge('http://node0.com', 'http://node1.com');
    graph.addEdge('http://node2.com', 'http://node1.com');
    graph.addEdge('http://node3.com', 'http://node1.com');
    graph.addEdge('http://node4.com', 'http://node1.com');

    // HUB candidate: node4 links to nodes 1, 5, 6 and 7 and has no in-links.
    graph.addEdge('http://node4.com', 'http://node5.com');
    graph.addEdge('http://node4.com', 'http://node6.com');
    graph.addEdge('http://node4.com', 'http://node7.com');

    // POWER candidate: node2 is linked by node0 and itself links to nodes 1 and 5.
    graph.addEdge('http://node0.com', 'http://node2.com');
    // NOTE(review): node2 -> node1 was already added above; the repeat is kept
    // because it is unclear from here whether addEdge de-duplicates or accumulates.
    graph.addEdge('http://node2.com', 'http://node1.com');
    graph.addEdge('http://node2.com', 'http://node5.com');

    // PERIPHERAL candidate: node10 has no links at all.
    // Filler edge intended to pull the medians down.
    graph.addEdge('http://node8.com', 'http://node9.com');

    computeHITS(graph, { iterations: 20 });

    const roles = graph.getNodes().map(n => n.linkRole).filter(Boolean);
    expect(roles).toContain('authority');
    expect(roles).toContain('hub');
    expect(roles).toContain('power');
    expect(roles).toContain('peripheral');
  });

  it('should handle large synthetic graphs (Performance Test)', () => {
    const graph = new Graph();
    const nodeCount = 5000;

    // Seeded linear congruential generator so the synthetic graph is identical on
    // every run; an unseeded Math.random() would make a failure impossible to replay.
    let seed = 42;
    const nextRandom = (): number => {
      seed = (seed * 1664525 + 1013904223) % 4294967296;
      return seed / 4294967296;
    };

    // Create 5000 nodes
    for (let i = 0; i < nodeCount; i++) {
      graph.addNode(`http://page${i}.com`, 1, 200);
    }

    // Create pseudo-random edges (up to 10 per node; self-links are skipped)
    for (let i = 0; i < nodeCount; i++) {
      for (let j = 0; j < 10; j++) {
        const target = Math.floor(nextRandom() * nodeCount);
        if (i !== target) {
          graph.addEdge(`http://page${i}.com`, `http://page${target}.com`);
        }
      }
    }

    const start = Date.now();
    computeHITS(graph, { iterations: 20 });
    const duration = Date.now() - start;

    console.log(`HITS on 5000 nodes took ${duration}ms`);
    // Generous wall-clock bound to leave headroom for slow CI machines.
    expect(duration).toBeLessThan(2000);
    expect(graph.nodes.get('http://page0.com')?.hubScore).toBeDefined();
  });
});
@@ -0,0 +1,58 @@
1
+ import { describe, expect, test } from 'vitest';
2
+ import { generateHtml } from '../src/report/html.js';
3
+ import { Metrics } from '../src/graph/metrics.js';
4
+
5
/**
 * Smoke tests for `generateHtml`: the rendered page must embed the key metric
 * values, and must omit the session section when no session stats are given.
 */
describe('html report generator', () => {
  test('generates valid html string with metrics', () => {
    const metrics: Metrics = {
      totalPages: 10,
      totalEdges: 20,
      orphanPages: ['https://example.com/orphan'],
      nearOrphans: [],
      deepPages: [],
      topAuthorityPages: [{ url: 'https://example.com/', authority: 0.9 }],
      averageOutDegree: 2.0,
      maxDepthFound: 5,
      crawlEfficiencyScore: 0.8,
      averageDepth: 3.0,
      structuralEntropy: 1.5,
      topPageRankPages: [],
      limitReached: false,
      sessionStats: {
        pagesFetched: 5,
        pagesCached: 2,
        pagesSkipped: 0,
        totalFound: 7
      }
    };
    const graphData = {
      nodes: [{ url: 'https://example.com/', depth: 0, inLinks: 5, outLinks: 2, status: 200 }],
      edges: []
    };

    const rendered = generateHtml(graphData, metrics);

    // Fragments that must appear in the output, checked in this order.
    const expectedFragments = [
      '<!DOCTYPE html>',
      'Crawlith Site Graph',
      '10</span>', // totalPages
      '5 pages</span>', // pagesFetched
      '2</span>', // pagesCached
      'https://example.com/orphan',
      'window.GRAPH_DATA ='
    ];
    for (const fragment of expectedFragments) {
      expect(rendered).toContain(fragment);
    }
  });

  test('handles missing session stats', () => {
    // Deliberately partial metrics object with sessionStats explicitly null.
    const metrics: any = {
      totalPages: 10,
      totalEdges: 20,
      orphanPages: [],
      averageOutDegree: 2.0,
      maxDepthFound: 5,
      topAuthorityPages: [],
      sessionStats: null
    };
    const emptyGraph = { nodes: [], edges: [] };

    const rendered = generateHtml(emptyGraph, metrics);

    expect(rendered).not.toContain('Session Crawl:');
  });
});
@@ -0,0 +1,138 @@
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
+ import { LockManager } from '../../src/lock/lockManager.js';
3
+ import { generateLockKey } from '../../src/lock/hashKey.js';
4
+ import fs from 'node:fs/promises';
5
+ import { existsSync, unlinkSync, readFileSync } from 'node:fs';
6
+ import path from 'node:path';
7
+ import os from 'node:os';
8
+ import { isPidAlive } from '../../src/lock/pidCheck.js';
9
+
10
+ // Mock fs and os
11
+ vi.mock('node:fs/promises');
12
+ vi.mock('node:fs');
13
+ vi.mock('node:os');
14
+ vi.mock('../../src/lock/pidCheck.js', () => ({
15
+ isPidAlive: vi.fn()
16
+ }));
17
+
18
/**
 * Tests for `LockManager.acquireLock` / `releaseLock` with the file system,
 * `os.homedir` and the PID liveness check all mocked.
 */
describe('LockManager', () => {
  const mockHomeDir = '/home/user';
  const lockDir = path.join(mockHomeDir, '.crawlith', 'locks');
  const command = 'test-command';
  const target = 'http://example.com';
  const options = { limit: 10 };
  const lockHash = generateLockKey(command, target, options);
  const lockPath = path.join(lockDir, `${lockHash}.lock`);

  // Captured once so the real descriptor can be put back after each test;
  // vi.restoreAllMocks() does not undo Object.defineProperty overrides.
  const originalPidDescriptor = Object.getOwnPropertyDescriptor(process, 'pid');

  beforeEach(() => {
    vi.resetAllMocks();
    vi.mocked(os.homedir).mockReturnValue(mockHomeDir);
    vi.mocked(fs.mkdir).mockResolvedValue(undefined);
    vi.mocked(fs.writeFile).mockResolvedValue(undefined);
    vi.mocked(existsSync).mockReturnValue(false);
    vi.mocked(readFileSync).mockReturnValue('{}');
    vi.mocked(unlinkSync).mockReturnValue(undefined);

    // Pin process.pid to a known value for the duration of the test.
    Object.defineProperty(process, 'pid', { value: 12345, configurable: true });

    // Make process.exit throw so the code under test stops where it would exit.
    vi.spyOn(process, 'exit').mockImplementation((code) => {
      throw new Error(`Process exit ${code}`);
    });

    // Silence console output while still allowing assertions on it.
    vi.spyOn(console, 'log').mockImplementation(() => {});
    vi.spyOn(console, 'warn').mockImplementation(() => {});
    vi.spyOn(console, 'error').mockImplementation(() => {});

    // LockManager keeps state between calls; releasing here is intended to clear
    // anything left over from the previous test.
    LockManager.releaseLock();
  });

  afterEach(() => {
    vi.restoreAllMocks();
    // Undo the process.pid override so it cannot leak into other suites.
    if (originalPidDescriptor) {
      Object.defineProperty(process, 'pid', originalPidDescriptor);
    }
  });

  it('should acquire lock when no lock exists', async () => {
    await LockManager.acquireLock(command, target, options);

    expect(fs.mkdir).toHaveBeenCalledWith(lockDir, { recursive: true });
    // The 'wx' flag makes the write fail if the lock file already exists.
    expect(fs.writeFile).toHaveBeenCalledWith(
      lockPath,
      expect.stringContaining('"limit": 10'),
      expect.objectContaining({ flag: 'wx' })
    );
  });

  it('should fail if lock exists and PID is alive', async () => {
    vi.mocked(existsSync).mockReturnValue(true);
    vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
      pid: 9999,
      startedAt: Date.now(),
      command,
      target,
      args: options
    }));
    vi.mocked(isPidAlive).mockReturnValue(true);

    await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');

    expect(console.error).toHaveBeenCalledWith(expect.stringContaining('already running'));
  });

  it('should clear stale lock and acquire if PID is dead', async () => {
    vi.mocked(existsSync).mockReturnValue(true);
    vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
      pid: 9999,
      startedAt: Date.now(),
      command,
      target,
      args: options
    }));
    vi.mocked(isPidAlive).mockReturnValue(false);

    await LockManager.acquireLock(command, target, options);

    expect(unlinkSync).toHaveBeenCalledWith(lockPath);
    expect(fs.writeFile).toHaveBeenCalled();
    expect(console.log).toHaveBeenCalledWith(expect.stringContaining('Detected stale lock'));
  });

  it('should override lock if force is true', async () => {
    vi.mocked(existsSync).mockReturnValue(true);
    // The owning PID is reported alive; force mode must override it anyway.
    vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
      pid: 9999
    }));
    vi.mocked(isPidAlive).mockReturnValue(true);

    await LockManager.acquireLock(command, target, options, true); // force = true

    expect(unlinkSync).toHaveBeenCalledWith(lockPath);
    expect(fs.writeFile).toHaveBeenCalled();
    expect(console.warn).toHaveBeenCalledWith(expect.stringContaining('Force mode enabled'));
  });

  it('should handle race condition (EEXIST)', async () => {
    vi.mocked(existsSync).mockReturnValue(false);
    // Reject with a real Error carrying `code`, mirroring how Node reports fs failures.
    const eexist = Object.assign(new Error('EEXIST: file already exists'), { code: 'EEXIST' });
    vi.mocked(fs.writeFile).mockRejectedValue(eexist);

    await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');
    expect(console.error).toHaveBeenCalledWith(expect.stringContaining('Race condition'));
  });

  it('should release lock on exit', async () => {
    // Acquire first (existsSync returns false by default from beforeEach)
    await LockManager.acquireLock(command, target, options);

    // From here on the lock file is reported as present.
    vi.mocked(existsSync).mockReturnValue(true);

    LockManager.releaseLock();

    expect(unlinkSync).toHaveBeenCalledWith(lockPath);
  });
});