@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
package/tests/proxy_safety.test.ts DELETED
@@ -1,57 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { Fetcher } from '../src/crawler/fetcher.js';
3
- import { request, ProxyAgent } from 'undici';
4
-
5
- vi.mock('undici', async (importOriginal) => {
6
- const original = await importOriginal<typeof import('undici')>();
7
- return {
8
- ...original,
9
- request: vi.fn(),
10
- ProxyAgent: vi.fn(function () {
11
- return {
12
- request: vi.fn(),
13
- close: vi.fn()
14
- };
15
- })
16
- };
17
- });
18
-
19
- describe('Proxy Integration', () => {
20
- beforeEach(() => {
21
- vi.clearAllMocks();
22
- });
23
-
24
- it('should use ProxyAgent when proxyUrl is provided', async () => {
25
- const fetcher = new Fetcher({ proxyUrl: 'http://proxy.com:8080', rate: 100 });
26
- const mockRequest = vi.mocked(request);
27
-
28
- // Mock the request to return a successful response immediately
29
- mockRequest.mockResolvedValueOnce({
30
- statusCode: 200,
31
- headers: {},
32
- body: {
33
- on: vi.fn((event, cb) => {
34
- if (event === 'data') {
35
- // Simulate async data chunk
36
- setTimeout(() => cb(Buffer.from('ok')), 0);
37
- }
38
- if (event === 'end') {
39
- // Simulate async end
40
- setTimeout(() => cb(), 0);
41
- }
42
- return { on: vi.fn() }; // chaining
43
- }),
44
- dump: vi.fn(),
45
- text: vi.fn().mockResolvedValue('ok')
46
- }
47
- } as any);
48
-
49
- await fetcher.fetch('http://target.com');
50
-
51
- expect(ProxyAgent).toHaveBeenCalledWith('http://proxy.com:8080');
52
- });
53
-
54
- it('should fail fast on invalid proxy URL', () => {
55
- expect(() => new Fetcher({ proxyUrl: 'not-a-url' })).toThrow('Invalid proxy URL');
56
- });
57
- });
package/tests/redirect_safety.test.ts DELETED
@@ -1,73 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { RedirectController } from '../src/core/network/redirectController.js';
3
- import { Fetcher } from '../src/crawler/fetcher.js';
4
- import { request } from 'undici';
5
-
6
- vi.mock('undici', () => ({
7
- request: vi.fn(),
8
- ProxyAgent: vi.fn().mockImplementation(() => ({ dispatcher: {} }))
9
- }));
10
-
11
- describe('RedirectController', () => {
12
- it('should limit hops', () => {
13
- const ctrl = new RedirectController(2);
14
- expect(ctrl.nextHop('http://b.com')).toBe(null);
15
- expect(ctrl.nextHop('http://c.com')).toBe(null);
16
- expect(ctrl.nextHop('http://d.com')).toBe('redirect_limit_exceeded');
17
- });
18
-
19
- it('should detect loops', () => {
20
- const ctrl = new RedirectController(5);
21
- expect(ctrl.nextHop('http://b.com')).toBe(null);
22
- expect(ctrl.nextHop('http://a.com')).toBe(null);
23
- expect(ctrl.nextHop('http://b.com')).toBe('redirect_loop');
24
- });
25
- });
26
-
27
- describe('Fetcher Redirect Integration', () => {
28
- let fetcher: Fetcher;
29
-
30
- beforeEach(() => {
31
- vi.clearAllMocks();
32
- fetcher = new Fetcher({ rate: 100, maxRedirects: 2 });
33
- });
34
-
35
- it('should stop at max redirects', async () => {
36
- const mockRequest = vi.mocked(request);
37
-
38
- // Return 301 with unique locations
39
- mockRequest
40
- .mockResolvedValueOnce({
41
- statusCode: 301,
42
- headers: { location: 'http://a.com' },
43
- body: { dump: vi.fn().mockResolvedValue(undefined) }
44
- } as any)
45
- .mockResolvedValueOnce({
46
- statusCode: 301,
47
- headers: { location: 'http://b.com' },
48
- body: { dump: vi.fn().mockResolvedValue(undefined) }
49
- } as any)
50
- .mockResolvedValueOnce({
51
- statusCode: 301,
52
- headers: { location: 'http://c.com' },
53
- body: { dump: vi.fn().mockResolvedValue(undefined) }
54
- } as any);
55
-
56
- const res = await fetcher.fetch('http://start.com');
57
- expect(res.status).toBe('redirect_limit_exceeded');
58
- expect(res.redirectChain).toHaveLength(2);
59
- });
60
-
61
- it('should detect loops in fetch', async () => {
62
- const mockRequest = vi.mocked(request);
63
-
64
- mockRequest.mockResolvedValue({
65
- statusCode: 301,
66
- headers: { location: 'http://start.com' },
67
- body: { dump: vi.fn().mockResolvedValue(undefined) }
68
- } as any);
69
-
70
- const res = await fetcher.fetch('http://start.com');
71
- expect(res.status).toBe('redirect_loop');
72
- });
73
- });
package/tests/safety.test.ts DELETED
@@ -1,114 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { IPGuard } from '../src/core/security/ipGuard.js';
3
- import { RateLimiter } from '../src/core/network/rateLimiter.js';
4
- import { RetryPolicy } from '../src/core/network/retryPolicy.js';
5
- import { ResponseLimiter } from '../src/core/network/responseLimiter.js';
6
- import { Readable } from 'stream';
7
- import * as dns from 'dns';
8
-
9
- vi.mock('dns', () => ({
10
- resolve4: vi.fn(),
11
- resolve6: vi.fn(),
12
- }));
13
-
14
- describe('IPGuard', () => {
15
- it('should block IPv4 internal ranges', () => {
16
- expect(IPGuard.isInternal('127.0.0.1')).toBe(true);
17
- expect(IPGuard.isInternal('10.0.0.1')).toBe(true);
18
- expect(IPGuard.isInternal('192.168.1.1')).toBe(true);
19
- expect(IPGuard.isInternal('172.16.0.1')).toBe(true);
20
- expect(IPGuard.isInternal('172.31.255.255')).toBe(true);
21
- expect(IPGuard.isInternal('169.254.1.1')).toBe(true);
22
- expect(IPGuard.isInternal('0.0.0.0')).toBe(true);
23
- });
24
-
25
- it('should allow public IPv4', () => {
26
- expect(IPGuard.isInternal('8.8.8.8')).toBe(false);
27
- expect(IPGuard.isInternal('1.1.1.1')).toBe(false);
28
- expect(IPGuard.isInternal('172.32.0.1')).toBe(false);
29
- });
30
-
31
- it('should block IPv6 internal/local addresses', () => {
32
- expect(IPGuard.isInternal('::1')).toBe(true);
33
- expect(IPGuard.isInternal('fc00::1')).toBe(true);
34
- expect(IPGuard.isInternal('fe80::1')).toBe(true);
35
- });
36
-
37
- it('should validate hostname by resolving IPs', async () => {
38
- const resolve4Spy = vi.mocked(dns.resolve4);
39
- const resolve6Spy = vi.mocked(dns.resolve6);
40
-
41
- resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['1.1.1.1']));
42
- resolve6Spy.mockImplementation((_h: string, cb: any) => cb(null, []));
43
- expect(await IPGuard.validateHost('example.com')).toBe(true);
44
-
45
- resolve4Spy.mockImplementation((_h: string, cb: any) => cb(null, ['127.0.0.1']));
46
- expect(await IPGuard.validateHost('localhost')).toBe(false);
47
- });
48
- });
49
-
50
- describe('RateLimiter', () => {
51
- it('should enforce rate limits', async () => {
52
- const limiter = new RateLimiter(1); // 1 req/sec = 1000ms interval
53
- const start = Date.now();
54
-
55
- await limiter.waitForToken('host1'); // returns immediately, tokens becomes 0
56
- await limiter.waitForToken('host1'); // waits for refill (1s)
57
-
58
- const elapsed = Date.now() - start;
59
- expect(elapsed).toBeGreaterThanOrEqual(1000);
60
- }, 5000);
61
-
62
- it('should have separate buckets for hosts', async () => {
63
- const limiter = new RateLimiter(1);
64
- const start = Date.now();
65
-
66
- await limiter.waitForToken('host1');
67
- await limiter.waitForToken('host2');
68
-
69
- const elapsed = Date.now() - start;
70
- expect(elapsed).toBeLessThan(100);
71
- });
72
-
73
- it('should respect crawlDelay if higher than rate', async () => {
74
- const limiter = new RateLimiter(1); // 1000ms interval
75
- const start = Date.now();
76
-
77
- await limiter.waitForToken('host3'); // returns immediately, tokens = 0
78
- await limiter.waitForToken('host3', 1); // 1s crawl delay
79
-
80
- const elapsed = Date.now() - start;
81
- expect(elapsed).toBeGreaterThanOrEqual(1000);
82
- }, 5000);
83
- });
84
-
85
- describe('RetryPolicy', () => {
86
- it('should retry transient failures', async () => {
87
- let calls = 0;
88
- const result = await RetryPolicy.execute(
89
- async () => {
90
- calls++;
91
- if (calls < 3) throw new Error('Status 500');
92
- return 'success';
93
- },
94
- (err) => err.message === 'Status 500',
95
- { maxRetries: 3, baseDelay: 10 }
96
- );
97
-
98
- expect(result).toBe('success');
99
- expect(calls).toBe(3);
100
- });
101
- });
102
-
103
- describe('ResponseLimiter', () => {
104
- it('should stream to string', async () => {
105
- const stream = Readable.from(['hello ', 'world']);
106
- const result = await ResponseLimiter.streamToString(stream, 100);
107
- expect(result).toBe('hello world');
108
- });
109
-
110
- it('should abort if limit exceeded', async () => {
111
- const stream = Readable.from(['too ', 'large ', 'content']);
112
- await expect(ResponseLimiter.streamToString(stream, 5)).rejects.toThrow('Oversized response');
113
- });
114
- });
package/tests/scope.test.ts DELETED
@@ -1,66 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { DomainFilter } from '../src/core/scope/domainFilter.js';
3
- import { SubdomainPolicy } from '../src/core/scope/subdomainPolicy.js';
4
- import { ScopeManager } from '../src/core/scope/scopeManager.js';
5
-
6
- describe('DomainFilter', () => {
7
- it('should normalize hostnames', () => {
8
- const filter = new DomainFilter(['EXAMPLE.COM.'], ['DENY.COM.']);
9
- expect(filter.isAllowed('example.com')).toBe(true);
10
- expect(filter.isAllowed('deny.com')).toBe(false);
11
- });
12
-
13
- it('should respect precedence (deny wins)', () => {
14
- const filter = new DomainFilter(['example.com'], ['example.com']);
15
- expect(filter.isAllowed('example.com')).toBe(false);
16
- });
17
-
18
- it('should handle punycode', () => {
19
- // xn--80ak6aa92e.com is punycode for пример.com
20
- const filter = new DomainFilter(['xn--80ak6aa92e.com']);
21
- expect(filter.isAllowed('XN--80AK6AA92E.COM')).toBe(true);
22
- });
23
-
24
- it('should block if not in allow list (when list not empty)', () => {
25
- const filter = new DomainFilter(['allowed.com']);
26
- expect(filter.isAllowed('other.com')).toBe(false);
27
- });
28
- });
29
-
30
- describe('SubdomainPolicy', () => {
31
- it('should enforce exact match by default', () => {
32
- const policy = new SubdomainPolicy('https://example.com');
33
- expect(policy.isAllowed('example.com')).toBe(true);
34
- expect(policy.isAllowed('sub.example.com')).toBe(false);
35
- });
36
-
37
- it('should allow valid subdomains when enabled', () => {
38
- const policy = new SubdomainPolicy('https://example.com', true);
39
- expect(policy.isAllowed('example.com')).toBe(true);
40
- expect(policy.isAllowed('sub.example.com')).toBe(true);
41
- expect(policy.isAllowed('deep.sub.example.com')).toBe(true);
42
- });
43
-
44
- it('should reject malicious suffix matches', () => {
45
- const policy = new SubdomainPolicy('https://example.com', true);
46
- expect(policy.isAllowed('evil-example.com')).toBe(false);
47
- expect(policy.isAllowed('example.com.evil.com')).toBe(false);
48
- });
49
- });
50
-
51
- describe('ScopeManager', () => {
52
- it('should compose policies correctly', () => {
53
- const manager = new ScopeManager({
54
- rootUrl: 'https://example.com',
55
- allowedDomains: ['example.com', 'sub.example.com', 'other.com'],
56
- deniedDomains: ['bad.example.com'],
57
- includeSubdomains: true
58
- });
59
-
60
- expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
61
- expect(manager.isUrlEligible('https://sub.example.com/')).toBe('allowed');
62
- expect(manager.isUrlEligible('https://bad.example.com/')).toBe('blocked_by_domain_filter');
63
- expect(manager.isUrlEligible('https://other.com/')).toBe('allowed');
64
- expect(manager.isUrlEligible('https://google.com/')).toBe('blocked_by_domain_filter');
65
- });
66
- });
package/tests/scoring.test.ts DELETED
@@ -1,59 +0,0 @@
1
- import { expect, test } from 'vitest';
2
- import { scorePageSeo, aggregateSiteScore } from '../src/analysis/scoring.js';
3
- import { PageAnalysis } from '../src/analysis/analyze.js';
4
-
5
- const basePage: PageAnalysis = {
6
- url: 'https://example.com',
7
- status: 200,
8
- title: { value: 'x'.repeat(55), length: 55, status: 'ok' },
9
- metaDescription: { value: 'x'.repeat(150), length: 150, status: 'ok' },
10
- h1: { count: 1, status: 'ok', matchesTitle: false },
11
- content: { wordCount: 700, textHtmlRatio: 0.3, uniqueSentenceCount: 8 },
12
- thinScore: 0,
13
- images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
- links: { internalLinks: 5, externalLinks: 2, nofollowCount: 1, externalRatio: 2 / 7 },
15
- structuredData: { present: true, valid: true, types: ['Article'] },
16
- seoScore: 0
17
- };
18
-
19
- test('page score stays in 0-100', () => {
20
- expect(scorePageSeo(basePage)).toBeGreaterThanOrEqual(0);
21
- expect(scorePageSeo(basePage)).toBeLessThanOrEqual(100);
22
-
23
- const badPage: PageAnalysis = {
24
- ...basePage,
25
- title: { value: null, length: 0, status: 'missing' },
26
- metaDescription: { value: null, length: 0, status: 'missing' },
27
- h1: { count: 0, status: 'critical', matchesTitle: false },
28
- content: { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
29
- thinScore: 100,
30
- images: { totalImages: 2, missingAlt: 2, emptyAlt: 0 },
31
- structuredData: { present: false, valid: false, types: [] },
32
- links: { internalLinks: 0, externalLinks: 9, nofollowCount: 9, externalRatio: 1 }
33
- };
34
- expect(scorePageSeo(badPage)).toBeLessThan(50);
35
- });
36
-
37
- test('aggregate site score includes existing metrics signals', () => {
38
- const score = aggregateSiteScore({
39
- totalPages: 2,
40
- totalEdges: 1,
41
- orphanPages: ['https://example.com/x'],
42
- nearOrphans: [],
43
- deepPages: [],
44
- topAuthorityPages: [{ url: 'a', authority: 1 }],
45
- averageOutDegree: 1,
46
- maxDepthFound: 1,
47
- crawlEfficiencyScore: 0.8,
48
- averageDepth: 1,
49
- structuralEntropy: 2,
50
- limitReached: false
51
- }, [
52
- { ...basePage, seoScore: 70 },
53
- { ...basePage, seoScore: 90, url: 'https://example.com/2' }
54
- ]);
55
-
56
- expect(score.seoHealthScore).toBe(80);
57
- expect(score.overallScore).toBeGreaterThan(0);
58
- expect(score.overallScore).toBeLessThanOrEqual(100);
59
- });
package/tests/sitemap.test.ts DELETED
@@ -1,88 +0,0 @@
1
- import { test, expect, beforeEach } from 'vitest';
2
- import { Sitemap } from '../src/crawler/sitemap.js';
3
- import { MockAgent, setGlobalDispatcher } from 'undici';
4
-
5
- let mockAgent: MockAgent;
6
-
7
- beforeEach(() => {
8
- mockAgent = new MockAgent();
9
- mockAgent.disableNetConnect();
10
- setGlobalDispatcher(mockAgent);
11
- });
12
-
13
- test('fetches and parses simple sitemap', async () => {
14
- const client = mockAgent.get('https://example.com');
15
- client.intercept({
16
- path: '/sitemap.xml',
17
- method: 'GET'
18
- }).reply(200, `
19
- <?xml version="1.0" encoding="UTF-8"?>
20
- <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
21
- <url>
22
- <loc>https://example.com/page1</loc>
23
- </url>
24
- <url>
25
- <loc>https://example.com/page2</loc>
26
- </url>
27
- </urlset>
28
- `);
29
-
30
- const sitemap = new Sitemap();
31
- const urls = await sitemap.fetch('https://example.com/sitemap.xml');
32
- expect(urls).toContain('https://example.com/page1');
33
- expect(urls).toContain('https://example.com/page2');
34
- expect(urls.length).toBe(2);
35
- });
36
-
37
- test('handles sitemap index recursively', async () => {
38
- const client = mockAgent.get('https://example.com');
39
-
40
- // Index
41
- client.intercept({
42
- path: '/sitemap-index.xml',
43
- method: 'GET'
44
- }).reply(200, `
45
- <?xml version="1.0" encoding="UTF-8"?>
46
- <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
47
- <sitemap>
48
- <loc>https://example.com/sitemap1.xml</loc>
49
- </sitemap>
50
- </sitemapindex>
51
- `);
52
-
53
- // Child sitemap
54
- client.intercept({
55
- path: '/sitemap1.xml',
56
- method: 'GET'
57
- }).reply(200, `
58
- <?xml version="1.0" encoding="UTF-8"?>
59
- <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
60
- <url>
61
- <loc>https://example.com/page3</loc>
62
- </url>
63
- </urlset>
64
- `);
65
-
66
- const sitemap = new Sitemap();
67
- const urls = await sitemap.fetch('https://example.com/sitemap-index.xml');
68
- expect(urls).toContain('https://example.com/page3');
69
- expect(urls.length).toBe(1);
70
- });
71
-
72
- test('handles invalid xml gracefully', async () => {
73
- const client = mockAgent.get('https://example.com');
74
- client.intercept({ path: '/bad.xml', method: 'GET' }).reply(200, 'Not XML');
75
-
76
- const sitemap = new Sitemap();
77
- const urls = await sitemap.fetch('https://example.com/bad.xml');
78
- expect(urls.length).toBe(0);
79
- });
80
-
81
- test('handles fetch errors gracefully', async () => {
82
- const client = mockAgent.get('https://example.com');
83
- client.intercept({ path: '/error.xml', method: 'GET' }).reply(500, 'Error');
84
-
85
- const sitemap = new Sitemap();
86
- const urls = await sitemap.fetch('https://example.com/error.xml');
87
- expect(urls.length).toBe(0);
88
- });
package/tests/soft404.test.ts DELETED
@@ -1,41 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Parser } from '../src/crawler/parser.js';
3
-
4
- describe('Soft 404 Detection', () => {
5
- const parser = new Parser();
6
- const baseUrl = 'https://example.com';
7
-
8
- it('should detect soft 404 by title pattern', () => {
9
- const html = '<html><head><title>Page Not Found</title></head><body>Welcome to the site</body></html>';
10
- const result = parser.parse(html, baseUrl, 200);
11
- expect(result.soft404Score).toBeGreaterThan(0.3);
12
- expect(result.soft404Signals).toContain('title_pattern_not_found');
13
- });
14
-
15
- it('should detect soft 404 by H1 pattern', () => {
16
- const html = '<html><body><h1>404 Error</h1></body></html>';
17
- const result = parser.parse(html, baseUrl, 200);
18
- expect(result.soft404Score).toBeGreaterThan(0.2);
19
- expect(result.soft404Signals).toContain('h1_pattern_404');
20
- });
21
-
22
- it('should detect soft 404 by very low word count', () => {
23
- const html = '<html><body>Short text</body></html>';
24
- const result = parser.parse(html, baseUrl, 200);
25
- expect(result.soft404Score).toBeGreaterThan(0.2);
26
- expect(result.soft404Signals).toContain('very_low_word_count');
27
- });
28
-
29
- it('should detect soft 404 by lack of outbound links', () => {
30
- const html = '<html><body>A page with some text but no links.</body></html>';
31
- const result = parser.parse(html, baseUrl, 200);
32
- expect(result.soft404Signals).toContain('no_outbound_links');
33
- });
34
-
35
- it('should combine multiple signals for high score', () => {
36
- const html = '<html><head><title>Error</title></head><body><h1>Not Found</h1><p>The requested page was not found.</p></body></html>';
37
- const result = parser.parse(html, baseUrl, 200);
38
- // title (0.4) + h1 (0.3) + body phrase (0.2) + low word count (0.3) = 1.2 -> capped at 1.0
39
- expect(result.soft404Score).toBe(1.0);
40
- });
41
- });
package/tests/trap.test.ts DELETED
@@ -1,39 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { TrapDetector } from '../src/crawler/trap.js';
3
-
4
- describe('TrapDetector', () => {
5
- const detector = new TrapDetector();
6
-
7
- it('should detect session ID traps', () => {
8
- const result = detector.checkTrap('https://example.com/page?sid=12345', 1);
9
- expect(result.risk).toBeGreaterThan(0.8);
10
- expect(result.type).toBe('session_trap');
11
- });
12
-
13
- it('should detect calendar patterns', () => {
14
- const result = detector.checkTrap('https://example.com/archive/2023/12/01/', 1);
15
- expect(result.risk).toBeGreaterThan(0.6);
16
- expect(result.type).toBe('calendar_trap');
17
- });
18
-
19
- it('should detect pagination loops', () => {
20
- // Simulate many pages
21
- for (let i = 1; i <= 60; i++) {
22
- detector.checkTrap(`https://example.com/blog?page=${i}`, 1);
23
- }
24
- const result = detector.checkTrap('https://example.com/blog?page=61', 1);
25
- expect(result.risk).toBeGreaterThan(0.8);
26
- expect(result.type).toBe('pagination_loop');
27
- });
28
-
29
- it('should detect faceted navigation / parameter explosion', () => {
30
- detector.reset();
31
- const basePath = 'https://example.com/products';
32
- for (let i = 1; i <= 35; i++) {
33
- detector.checkTrap(`${basePath}?color=red&size=${i}`, 1);
34
- }
35
- const result = detector.checkTrap(`${basePath}?color=blue&size=large`, 1);
36
- expect(result.risk).toBeGreaterThan(0.9);
37
- expect(result.type).toBe('faceted_navigation');
38
- });
39
- });
package/tests/visualization_data.test.ts DELETED
@@ -1,46 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { SITEGRAPH_HTML } from '../src/report/sitegraph_template.js';
3
- import { Graph } from '../src/graph/graph.js';
4
- import { computePageRank } from '../src/graph/pagerank.js';
5
-
6
- describe('Visualization Data & Template', () => {
7
- it('should include pageRankScore in graph JSON output after PageRank computation', () => {
8
- const graph = new Graph();
9
- graph.addNode('https://a.com', 0, 200);
10
- graph.addNode('https://b.com', 1, 200);
11
- graph.addEdge('https://a.com', 'https://b.com');
12
-
13
- computePageRank(graph);
14
-
15
- const json = graph.toJSON();
16
- const nodeA = json.nodes.find(n => n.url === 'https://a.com');
17
- const nodeB = json.nodes.find(n => n.url === 'https://b.com');
18
-
19
- expect(nodeA).toBeDefined();
20
- expect(nodeB).toBeDefined();
21
- expect(typeof nodeA?.pageRankScore).toBe('number');
22
- expect(typeof nodeB?.pageRankScore).toBe('number');
23
- });
24
-
25
- it('should contain UI toggle buttons for Authority Mode', () => {
26
- expect(SITEGRAPH_HTML).toContain('id="btn-auth-pagerank"');
27
- expect(SITEGRAPH_HTML).toContain('id="btn-auth-structural"');
28
- });
29
-
30
- it('should contain setAuthorityMode function', () => {
31
- // Use regex to be flexible with whitespace
32
- expect(SITEGRAPH_HTML).toMatch(/function\s+setAuthorityMode\s*\(mode,\s*btn\)/);
33
- expect(SITEGRAPH_HTML).toContain('n.authority = mode === \'pagerank\' ? n.pageRankAuthority : n.structuralAuthority');
34
- });
35
-
36
- it('should contain logic to calculate pageRankAuthority from pageRankScore', () => {
37
- expect(SITEGRAPH_HTML).toContain('n.pageRankAuthority = n.pageRankScore / 100');
38
- expect(SITEGRAPH_HTML).toContain('n.structuralAuthority = Math.log(1 + n.inLinks)');
39
- });
40
-
41
- it('should update details panel to show both metrics', () => {
42
- expect(SITEGRAPH_HTML).toContain('id="d-auth-container"');
43
- expect(SITEGRAPH_HTML).toContain('In-Degree: ${structVal}');
44
- expect(SITEGRAPH_HTML).toContain('PR: <strong>${prVal}</strong>');
45
- });
46
- });
package/tsconfig.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "extends": "../../tsconfig.json",
3
- "compilerOptions": {
4
- "outDir": "dist",
5
- "rootDir": "src",
6
- "declaration": true
7
- },
8
- "include": [
9
- "src"
10
- ]
11
- }