@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,98 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { computePageRank } from '../src/graph/pagerank.js';
4
-
5
- describe('PageRank Engine', () => {
6
- it('should calculate identical PageRank for a simple loop', () => {
7
- const graph = new Graph();
8
- graph.addNode('https://a.com', 0, 200);
9
- graph.addNode('https://b.com', 1, 200);
10
- graph.addEdge('https://a.com', 'https://b.com');
11
- graph.addEdge('https://b.com', 'https://a.com');
12
-
13
- computePageRank(graph);
14
- const nodes = graph.getNodes();
15
-
16
- expect(nodes[0].pageRank).toBeCloseTo(0.5, 4);
17
- expect(nodes[1].pageRank).toBeCloseTo(0.5, 4);
18
- expect(nodes[0].pageRankScore).toBe(100);
19
- expect(nodes[1].pageRankScore).toBe(100);
20
- });
21
-
22
- it('should identify the center of a star graph as most important', () => {
23
- const graph = new Graph();
24
- graph.addNode('https://center.com', 0, 200);
25
- graph.addNode('https://p1.com', 1, 200);
26
- graph.addNode('https://p2.com', 1, 200);
27
- graph.addNode('https://p3.com', 1, 200);
28
-
29
- // Star in: all link to center
30
- graph.addEdge('https://p1.com', 'https://center.com');
31
- graph.addEdge('https://p2.com', 'https://center.com');
32
- graph.addEdge('https://p3.com', 'https://center.com');
33
-
34
- computePageRank(graph);
35
- const nodes = graph.getNodes();
36
-
37
- const center = nodes.find(n => n.url.includes('center'))!;
38
- const leaves = nodes.filter(n => !n.url.includes('center'));
39
-
40
- expect(center.pageRankScore).toBe(100);
41
- leaves.forEach(leaf => {
42
- expect(leaf.pageRankScore).toBeLessThan(100);
43
- expect(leaf.pageRank!).toBeLessThan(center.pageRank!);
44
- });
45
- });
46
-
47
- it('should respect link weights (Body > Nav > Footer)', () => {
48
- const graph = new Graph();
49
- graph.addNode('https://source.com', 0, 200);
50
- graph.addNode('https://body-target.com', 1, 200);
51
- graph.addNode('https://footer-target.com', 1, 200);
52
-
53
- // Body weight 1.0, Footer weight 0.4
54
- graph.addEdge('https://source.com', 'https://body-target.com', 1.0);
55
- graph.addEdge('https://source.com', 'https://footer-target.com', 0.4);
56
-
57
- computePageRank(graph);
58
-
59
- const bodyTarget = graph.nodes.get('https://body-target.com')!;
60
- const footerTarget = graph.nodes.get('https://footer-target.com')!;
61
-
62
- expect(bodyTarget.pageRank!).toBeGreaterThan(footerTarget.pageRank!);
63
- });
64
-
65
- it('should handle sink nodes by redistributing rank', () => {
66
- const graph = new Graph();
67
- graph.addNode('https://a.com', 0, 200);
68
- graph.addNode('https://b.com', 1, 200); // b is a sink
69
- graph.addEdge('https://a.com', 'https://b.com');
70
-
71
- computePageRank(graph);
72
-
73
- const nodeA = graph.nodes.get('https://a.com')!;
74
- const nodeB = graph.nodes.get('https://b.com')!;
75
-
76
- // Without redistribution, A would lose all rank.
77
- // With redistribution, A should still have some rank.
78
- expect(nodeA.pageRank).toBeGreaterThan(0);
79
- expect(nodeB.pageRank).toBeGreaterThan(nodeA.pageRank!);
80
- });
81
-
82
- it('should exclude noindex pages from receiving or passing rank', () => {
83
- const graph = new Graph();
84
- graph.addNode('https://a.com', 0, 200);
85
- graph.addNode('https://no-index.com', 1, 200);
86
- graph.nodes.get('https://no-index.com')!.noindex = true;
87
-
88
- graph.addEdge('https://a.com', 'https://no-index.com');
89
-
90
- computePageRank(graph);
91
-
92
- const nodeA = graph.nodes.get('https://a.com')!;
93
- const nodeNoIndex = graph.nodes.get('https://no-index.com')!;
94
-
95
- expect(nodeNoIndex.pageRank).toBeUndefined();
96
- expect(nodeA.pageRank).toBe(1.0); // Only one eligible node
97
- });
98
- });
@@ -1,117 +0,0 @@
1
- import { test, expect } from 'vitest';
2
- import { Parser } from '../src/crawler/parser.js';
3
-
4
- const parser = new Parser();
5
- const baseUrl = 'https://example.com';
6
-
7
- test('extracts links correctly', () => {
8
- const html = `
9
- <html>
10
- <body>
11
- <a href="/page1">Page 1</a>
12
- <a href="https://other.com">Other</a>
13
- <a href="#hash">Hash</a>
14
- <a href="javascript:void(0)">JS</a>
15
- </body>
16
- </html>
17
- `;
18
- const result = parser.parse(html, baseUrl, 200);
19
- const urls = result.links.map(l => l.url);
20
- expect(urls).toContain('https://example.com/page1');
21
- expect(urls).toContain('https://other.com/');
22
- expect(urls).not.toContain('https://example.com/#hash');
23
- // It also extracts the base URL itself from href="#hash"
24
- expect(urls).toContain('https://example.com/');
25
- expect(result.links.length).toBe(3);
26
- });
27
-
28
- test('respects nofollow on links', () => {
29
- const html = `
30
- <html>
31
- <body>
32
- <a href="/page1" rel="nofollow">Page 1</a>
33
- <a href="/page2">Page 2</a>
34
- </body>
35
- </html>
36
- `;
37
- const result = parser.parse(html, baseUrl, 200);
38
- const urls = result.links.map(l => l.url);
39
- expect(urls).not.toContain('https://example.com/page1');
40
- expect(urls).toContain('https://example.com/page2');
41
- });
42
-
43
- test('respects meta robots nofollow', () => {
44
- const html = `
45
- <html>
46
- <head>
47
- <meta name="robots" content="nofollow">
48
- </head>
49
- <body>
50
- <a href="/page1">Page 1</a>
51
- </body>
52
- </html>
53
- `;
54
- const result = parser.parse(html, baseUrl, 200);
55
- expect(result.nofollow).toBe(true);
56
- expect(result.links.length).toBe(0);
57
- });
58
-
59
- test('detects canonical', () => {
60
- const html = `
61
- <html>
62
- <head>
63
- <link rel="canonical" href="https://example.com/canon">
64
- </head>
65
- </html>
66
- `;
67
- const result = parser.parse(html, baseUrl, 200);
68
- expect(result.canonical).toBe('https://example.com/canon');
69
- });
70
-
71
- test('detects relative canonical', () => {
72
- const html = `
73
- <html>
74
- <head>
75
- <link rel="canonical" href="/canon">
76
- </head>
77
- </html>
78
- `;
79
- const result = parser.parse(html, baseUrl, 200);
80
- expect(result.canonical).toBe('https://example.com/canon');
81
- });
82
-
83
- test('detects soft 404', () => {
84
- const html = `
85
- <html>
86
- <head><title>Page Not Found</title></head>
87
- <body>Sorry, the page you are looking for does not exist.</body>
88
- </html>
89
- `;
90
- const result = parser.parse(html, baseUrl, 200);
91
- expect(result.soft404Score).toBeGreaterThanOrEqual(0.5);
92
- });
93
-
94
- test('content hash ignores scripts', () => {
95
- const html1 = `
96
- <html><body><script>var x=1;</script><p>Hello</p></body></html>
97
- `;
98
- const html2 = `
99
- <html><body><script>var x=2;</script><p>Hello</p></body></html>
100
- `;
101
- const result1 = parser.parse(html1, baseUrl, 200);
102
- const result2 = parser.parse(html2, baseUrl, 200);
103
- expect(result1.contentHash).toBe(result2.contentHash);
104
- });
105
-
106
- test('detects meta robots noindex', () => {
107
- const html = `
108
- <html>
109
- <head>
110
- <meta name="robots" content="noindex, nofollow">
111
- </head>
112
- </html>
113
- `;
114
- const result = parser.parse(html, baseUrl, 200);
115
- expect(result.noindex).toBe(true);
116
- expect(result.nofollow).toBe(true);
117
- });
@@ -1,57 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { Fetcher } from '../src/crawler/fetcher.js';
3
- import { request, ProxyAgent } from 'undici';
4
-
5
- vi.mock('undici', async (importOriginal) => {
6
- const original = await importOriginal<typeof import('undici')>();
7
- return {
8
- ...original,
9
- request: vi.fn(),
10
- ProxyAgent: vi.fn(function () {
11
- return {
12
- request: vi.fn(),
13
- close: vi.fn()
14
- };
15
- })
16
- };
17
- });
18
-
19
- describe('Proxy Integration', () => {
20
- beforeEach(() => {
21
- vi.clearAllMocks();
22
- });
23
-
24
- it('should use ProxyAgent when proxyUrl is provided', async () => {
25
- const fetcher = new Fetcher({ proxyUrl: 'http://proxy.com:8080', rate: 100 });
26
- const mockRequest = vi.mocked(request);
27
-
28
- // Mock the request to return a successful response immediately
29
- mockRequest.mockResolvedValueOnce({
30
- statusCode: 200,
31
- headers: {},
32
- body: {
33
- on: vi.fn((event, cb) => {
34
- if (event === 'data') {
35
- // Simulate async data chunk
36
- setTimeout(() => cb(Buffer.from('ok')), 0);
37
- }
38
- if (event === 'end') {
39
- // Simulate async end
40
- setTimeout(() => cb(), 0);
41
- }
42
- return { on: vi.fn() }; // chaining
43
- }),
44
- dump: vi.fn(),
45
- text: vi.fn().mockResolvedValue('ok')
46
- }
47
- } as any);
48
-
49
- await fetcher.fetch('http://target.com');
50
-
51
- expect(ProxyAgent).toHaveBeenCalledWith('http://proxy.com:8080');
52
- });
53
-
54
- it('should fail fast on invalid proxy URL', () => {
55
- expect(() => new Fetcher({ proxyUrl: 'not-a-url' })).toThrow('Invalid proxy URL');
56
- });
57
- });
@@ -1,77 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { RedirectController } from '../src/core/network/redirectController.js';
3
- import { Fetcher } from '../src/crawler/fetcher.js';
4
- import { request } from 'undici';
5
-
6
- vi.mock('undici', () => ({
7
- request: vi.fn(),
8
- ProxyAgent: vi.fn().mockImplementation(() => ({ dispatcher: {} })),
9
- Agent: class {
10
- dispatch = vi.fn();
11
- },
12
- Dispatcher: class {}
13
- }));
14
-
15
- describe('RedirectController', () => {
16
- it('should limit hops', () => {
17
- const ctrl = new RedirectController(2);
18
- expect(ctrl.nextHop('http://b.com')).toBe(null);
19
- expect(ctrl.nextHop('http://c.com')).toBe(null);
20
- expect(ctrl.nextHop('http://d.com')).toBe('redirect_limit_exceeded');
21
- });
22
-
23
- it('should detect loops', () => {
24
- const ctrl = new RedirectController(5);
25
- expect(ctrl.nextHop('http://b.com')).toBe(null);
26
- expect(ctrl.nextHop('http://a.com')).toBe(null);
27
- expect(ctrl.nextHop('http://b.com')).toBe('redirect_loop');
28
- });
29
- });
30
-
31
- describe('Fetcher Redirect Integration', () => {
32
- let fetcher: Fetcher;
33
-
34
- beforeEach(() => {
35
- vi.clearAllMocks();
36
- fetcher = new Fetcher({ rate: 100, maxRedirects: 2 });
37
- });
38
-
39
- it('should stop at max redirects', async () => {
40
- const mockRequest = vi.mocked(request);
41
-
42
- // Return 301 with unique locations
43
- mockRequest
44
- .mockResolvedValueOnce({
45
- statusCode: 301,
46
- headers: { location: 'http://a.com' },
47
- body: { dump: vi.fn().mockResolvedValue(undefined) }
48
- } as any)
49
- .mockResolvedValueOnce({
50
- statusCode: 301,
51
- headers: { location: 'http://b.com' },
52
- body: { dump: vi.fn().mockResolvedValue(undefined) }
53
- } as any)
54
- .mockResolvedValueOnce({
55
- statusCode: 301,
56
- headers: { location: 'http://c.com' },
57
- body: { dump: vi.fn().mockResolvedValue(undefined) }
58
- } as any);
59
-
60
- const res = await fetcher.fetch('http://start.com');
61
- expect(res.status).toBe('redirect_limit_exceeded');
62
- expect(res.redirectChain).toHaveLength(2);
63
- });
64
-
65
- it('should detect loops in fetch', async () => {
66
- const mockRequest = vi.mocked(request);
67
-
68
- mockRequest.mockResolvedValue({
69
- statusCode: 301,
70
- headers: { location: 'http://start.com' },
71
- body: { dump: vi.fn().mockResolvedValue(undefined) }
72
- } as any);
73
-
74
- const res = await fetcher.fetch('http://start.com');
75
- expect(res.status).toBe('redirect_loop');
76
- });
77
- });
@@ -1,183 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { renderAnalysisCsv, AnalysisResult } from '../src/analysis/analyze.js';
3
-
4
- describe('renderAnalysisCsv', () => {
5
- test('renders CSV with headers', () => {
6
- const result: AnalysisResult = {
7
- pages: [],
8
- site_summary: {
9
- pages_analyzed: 0,
10
- avg_seo_score: 0,
11
- thin_pages: 0,
12
- duplicate_titles: 0,
13
- site_score: 0
14
- },
15
- site_scores: {} as any,
16
- active_modules: {
17
- seo: true,
18
- content: true,
19
- accessibility: true
20
- }
21
- };
22
-
23
- const csv = renderAnalysisCsv(result);
24
- expect(csv).toContain('URL,SEO Score,Thin Score,HTTP Status,Title,Title Length,Meta Description,Desc Length,Word Count,Internal Links,External Links');
25
- });
26
-
27
- test('renders a single page correctly', () => {
28
- const result: AnalysisResult = {
29
- pages: [
30
- {
31
- url: 'https://example.com',
32
- status: 200,
33
- seoScore: 85,
34
- thinScore: 10,
35
- title: { value: 'Example Domain', length: 14, status: 'ok' },
36
- metaDescription: { value: 'This is an example description.', length: 29, status: 'ok' },
37
- content: { wordCount: 500 } as any,
38
- links: { internalLinks: 5, externalLinks: 2 } as any,
39
- h1: {} as any,
40
- images: {} as any,
41
- structuredData: {} as any,
42
- meta: {}
43
- }
44
- ],
45
- site_summary: {
46
- pages_analyzed: 1,
47
- avg_seo_score: 85,
48
- thin_pages: 0,
49
- duplicate_titles: 0,
50
- site_score: 85
51
- },
52
- site_scores: {} as any,
53
- active_modules: {
54
- seo: true,
55
- content: true,
56
- accessibility: true
57
- }
58
- };
59
-
60
- const csv = renderAnalysisCsv(result);
61
- const lines = csv.split('\n');
62
- expect(lines.length).toBe(2);
63
- expect(lines[1]).toContain('https://example.com,85,10,200,"Example Domain",14,"This is an example description.",29,500,5,2');
64
- });
65
-
66
- test('escapes quotes in title and meta description', () => {
67
- const result: AnalysisResult = {
68
- pages: [
69
- {
70
- url: 'https://example.com/quote',
71
- status: 200,
72
- seoScore: 90,
73
- thinScore: 5,
74
- title: { value: 'Example "Quoted" Domain', length: 23, status: 'ok' },
75
- metaDescription: { value: 'This description contains "quotes" inside.', length: 42, status: 'ok' },
76
- content: { wordCount: 300 } as any,
77
- links: { internalLinks: 3, externalLinks: 1 } as any,
78
- h1: {} as any,
79
- images: {} as any,
80
- structuredData: {} as any,
81
- meta: {}
82
- }
83
- ],
84
- site_summary: {
85
- pages_analyzed: 1,
86
- avg_seo_score: 90,
87
- thin_pages: 0,
88
- duplicate_titles: 0,
89
- site_score: 90
90
- },
91
- site_scores: {} as any,
92
- active_modules: {
93
- seo: true,
94
- content: true,
95
- accessibility: true
96
- }
97
- };
98
-
99
- const csv = renderAnalysisCsv(result);
100
- const lines = csv.split('\n');
101
- // Expect double quotes to be escaped with double quotes: " -> ""
102
- // And the whole field wrapped in quotes
103
- expect(lines[1]).toContain('"Example ""Quoted"" Domain"');
104
- expect(lines[1]).toContain('"This description contains ""quotes"" inside."');
105
- });
106
-
107
- test('handles Pending/Limit status (status: 0)', () => {
108
- const result: AnalysisResult = {
109
- pages: [
110
- {
111
- url: 'https://example.com/pending',
112
- status: 0,
113
- seoScore: 0,
114
- thinScore: 0,
115
- title: { value: null, length: 0, status: 'missing' },
116
- metaDescription: { value: null, length: 0, status: 'missing' },
117
- content: { wordCount: 0 } as any,
118
- links: { internalLinks: 0, externalLinks: 0 } as any,
119
- h1: {} as any,
120
- images: {} as any,
121
- structuredData: {} as any,
122
- meta: {}
123
- }
124
- ],
125
- site_summary: {
126
- pages_analyzed: 1,
127
- avg_seo_score: 0,
128
- thin_pages: 0,
129
- duplicate_titles: 0,
130
- site_score: 0
131
- },
132
- site_scores: {} as any,
133
- active_modules: {
134
- seo: true,
135
- content: true,
136
- accessibility: true
137
- }
138
- };
139
-
140
- const csv = renderAnalysisCsv(result);
141
- const lines = csv.split('\n');
142
- expect(lines[1]).toContain('Pending/Limit');
143
- });
144
-
145
- test('handles missing title and description gracefully', () => {
146
- const result: AnalysisResult = {
147
- pages: [
148
- {
149
- url: 'https://example.com/missing',
150
- status: 404,
151
- seoScore: 0,
152
- thinScore: 0,
153
- title: { value: undefined as any, length: 0, status: 'missing' },
154
- metaDescription: { value: null as any, length: 0, status: 'missing' },
155
- content: { wordCount: 0 } as any,
156
- links: { internalLinks: 0, externalLinks: 0 } as any,
157
- h1: {} as any,
158
- images: {} as any,
159
- structuredData: {} as any,
160
- meta: {}
161
- }
162
- ],
163
- site_summary: {
164
- pages_analyzed: 1,
165
- avg_seo_score: 0,
166
- thin_pages: 0,
167
- duplicate_titles: 0,
168
- site_score: 0
169
- },
170
- site_scores: {} as any,
171
- active_modules: {
172
- seo: true,
173
- content: true,
174
- accessibility: true
175
- }
176
- };
177
-
178
- const csv = renderAnalysisCsv(result);
179
- const lines = csv.split('\n');
180
- // Should produce empty quoted strings ""
181
- expect(lines[1]).toContain(',"",0,"",0,0,0,0');
182
- });
183
- });
@@ -1,126 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { IPGuard } from '../src/core/security/ipGuard.js';
3
- import { RateLimiter } from '../src/core/network/rateLimiter.js';
4
- import { RetryPolicy } from '../src/core/network/retryPolicy.js';
5
- import { ResponseLimiter } from '../src/core/network/responseLimiter.js';
6
- import { Readable } from 'stream';
7
- import * as dns from 'dns';
8
-
9
- vi.mock('dns', () => ({
10
- resolve4: vi.fn(),
11
- resolve6: vi.fn(),
12
- }));
13
-
14
- describe('IPGuard', () => {
15
- it('should block IPv4 internal ranges', () => {
16
- expect(IPGuard.isInternal('127.0.0.1')).toBe(true);
17
- expect(IPGuard.isInternal('10.0.0.1')).toBe(true);
18
- expect(IPGuard.isInternal('192.168.1.1')).toBe(true);
19
- expect(IPGuard.isInternal('172.16.0.1')).toBe(true);
20
- expect(IPGuard.isInternal('172.31.255.255')).toBe(true);
21
- expect(IPGuard.isInternal('169.254.1.1')).toBe(true);
22
- expect(IPGuard.isInternal('0.0.0.0')).toBe(true);
23
- });
24
-
25
- it('should allow public IPv4', () => {
26
- expect(IPGuard.isInternal('8.8.8.8')).toBe(false);
27
- expect(IPGuard.isInternal('1.1.1.1')).toBe(false);
28
- expect(IPGuard.isInternal('172.32.0.1')).toBe(false);
29
- });
30
-
31
- it('should block IPv6 internal/local addresses', () => {
32
- expect(IPGuard.isInternal('::1')).toBe(true);
33
- expect(IPGuard.isInternal('fc00::1')).toBe(true);
34
- expect(IPGuard.isInternal('fe80::1')).toBe(true);
35
- });
36
-
37
- it('should block IPv4-mapped IPv6 internal addresses', () => {
38
- expect(IPGuard.isInternal('::ffff:127.0.0.1')).toBe(true);
39
- expect(IPGuard.isInternal('::ffff:10.0.0.1')).toBe(true);
40
- expect(IPGuard.isInternal('::ffff:192.168.1.1')).toBe(true);
41
- expect(IPGuard.isInternal('::ffff:169.254.169.254')).toBe(true);
42
- expect(IPGuard.isInternal('::ffff:7f00:0001')).toBe(true); // Hex 127.0.0.1
43
- });
44
-
45
- it('should allow IPv4-mapped IPv6 public addresses', () => {
46
- expect(IPGuard.isInternal('::ffff:8.8.8.8')).toBe(false);
47
- });
48
-
49
- it('should validate hostname by resolving IPs', async () => {
50
- const resolve4Spy = vi.mocked(dns.resolve4);
51
- const resolve6Spy = vi.mocked(dns.resolve6);
52
-
53
- resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['1.1.1.1']));
54
- resolve6Spy.mockImplementation((_h: string, cb: any) => cb(null, []));
55
- expect(await IPGuard.validateHost('example.com')).toBe(true);
56
-
57
- resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['127.0.0.1']));
58
- expect(await IPGuard.validateHost('localhost')).toBe(false);
59
- });
60
- });
61
-
62
- describe('RateLimiter', () => {
63
- it('should enforce rate limits', async () => {
64
- const limiter = new RateLimiter(1); // 1 req/sec = 1000ms interval
65
- const start = Date.now();
66
-
67
- await limiter.waitForToken('host1'); // returns immediately, tokens becomes 0
68
- await limiter.waitForToken('host1'); // waits for refill (1s)
69
-
70
- const elapsed = Date.now() - start;
71
- expect(elapsed).toBeGreaterThanOrEqual(1000);
72
- }, 5000);
73
-
74
- it('should have separate buckets for hosts', async () => {
75
- const limiter = new RateLimiter(1);
76
- const start = Date.now();
77
-
78
- await limiter.waitForToken('host1');
79
- await limiter.waitForToken('host2');
80
-
81
- const elapsed = Date.now() - start;
82
- expect(elapsed).toBeLessThan(100);
83
- });
84
-
85
- it('should respect crawlDelay if higher than rate', async () => {
86
- const limiter = new RateLimiter(1); // 1000ms interval
87
- const start = Date.now();
88
-
89
- await limiter.waitForToken('host3'); // returns immediately, tokens = 0
90
- await limiter.waitForToken('host3', 1); // 1s crawl delay
91
-
92
- const elapsed = Date.now() - start;
93
- expect(elapsed).toBeGreaterThanOrEqual(1000);
94
- }, 5000);
95
- });
96
-
97
- describe('RetryPolicy', () => {
98
- it('should retry transient failures', async () => {
99
- let calls = 0;
100
- const result = await RetryPolicy.execute(
101
- async () => {
102
- calls++;
103
- if (calls < 3) throw new Error('Status 500');
104
- return 'success';
105
- },
106
- (err) => err.message === 'Status 500',
107
- { maxRetries: 3, baseDelay: 10 }
108
- );
109
-
110
- expect(result).toBe('success');
111
- expect(calls).toBe(3);
112
- });
113
- });
114
-
115
- describe('ResponseLimiter', () => {
116
- it('should stream to string', async () => {
117
- const stream = Readable.from(['hello ', 'world']);
118
- const result = await ResponseLimiter.streamToString(stream, 100);
119
- expect(result).toBe('hello world');
120
- });
121
-
122
- it('should abort if limit exceeded', async () => {
123
- const stream = Readable.from(['too ', 'large ', 'content']);
124
- await expect(ResponseLimiter.streamToString(stream, 5)).rejects.toThrow('Oversized response');
125
- });
126
- });