@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,198 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
- import { LockManager } from '../../src/lock/lockManager.js';
3
- import { generateLockKey } from '../../src/lock/hashKey.js';
4
- import fs from 'node:fs/promises';
5
- import { existsSync, unlinkSync, readFileSync } from 'node:fs';
6
- import path from 'node:path';
7
- import os from 'node:os';
8
- import { isPidAlive } from '../../src/lock/pidCheck.js';
9
- import { EngineContext } from '../../src/events.js';
10
-
11
- // Mock fs and os
12
- vi.mock('node:fs/promises');
13
- vi.mock('node:fs');
14
- vi.mock('node:os');
15
- vi.mock('../../src/lock/pidCheck.js', () => ({
16
- isPidAlive: vi.fn()
17
- }));
18
-
19
- const mockContext: EngineContext = { emit: vi.fn() };
20
-
21
- describe('LockManager', () => {
22
- const mockHomeDir = '/home/user';
23
- const lockDir = path.join(mockHomeDir, '.crawlith', 'locks');
24
- const command = 'test-command';
25
- const target = 'http://example.com';
26
- const options = { limit: 10 };
27
- const lockHash = generateLockKey(command, target, options);
28
- const lockPath = path.join(lockDir, `${lockHash}.lock`);
29
-
30
- beforeEach(() => {
31
- vi.resetAllMocks();
32
- vi.mocked(os.homedir).mockReturnValue(mockHomeDir);
33
- vi.mocked(fs.mkdir).mockResolvedValue(undefined);
34
- vi.mocked(fs.writeFile).mockResolvedValue(undefined);
35
- vi.mocked(existsSync).mockReturnValue(false);
36
- vi.mocked(readFileSync).mockReturnValue('{}');
37
- vi.mocked(unlinkSync).mockReturnValue(undefined);
38
-
39
- // Mock process.pid
40
- Object.defineProperty(process, 'pid', { value: 12345, configurable: true });
41
-
42
- // Mock process.exit to throw error to stop execution flow in tests
43
- vi.spyOn(process, 'exit').mockImplementation((code) => {
44
- throw new Error(`Process exit ${code}`);
45
- });
46
- // Reset static state if any (LockManager stores lockFilePath)
47
- LockManager.releaseLock();
48
- });
49
-
50
- afterEach(() => {
51
- vi.restoreAllMocks();
52
- // Reset static state
53
- LockManager.releaseLock();
54
- });
55
-
56
- it('should acquire lock when no lock exists', async () => {
57
- await LockManager.acquireLock(command, target, options, mockContext);
58
-
59
- expect(fs.mkdir).toHaveBeenCalledWith(lockDir, { recursive: true });
60
- expect(fs.writeFile).toHaveBeenCalledWith(
61
- lockPath,
62
- expect.stringContaining('"limit": 10'),
63
- expect.objectContaining({ flag: 'wx' })
64
- );
65
- });
66
-
67
- it('should fail if lock exists and PID is alive', async () => {
68
- vi.mocked(existsSync).mockReturnValue(true);
69
- vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
70
- pid: 9999,
71
- startedAt: Date.now(),
72
- command,
73
- target,
74
- args: options
75
- }));
76
- vi.mocked(isPidAlive).mockReturnValue(true);
77
-
78
- await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
79
-
80
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('already running') }));
81
- });
82
-
83
- it('should clear stale lock and acquire if PID is dead', async () => {
84
- vi.mocked(existsSync).mockReturnValue(true);
85
- vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
86
- pid: 9999,
87
- startedAt: Date.now(),
88
- command,
89
- target,
90
- args: options
91
- }));
92
- vi.mocked(isPidAlive).mockReturnValue(false);
93
-
94
- await LockManager.acquireLock(command, target, options, mockContext);
95
-
96
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
97
- expect(fs.writeFile).toHaveBeenCalled();
98
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'info', message: expect.stringContaining('Detected stale lock') }));
99
- });
100
-
101
- it('should override lock if force is true', async () => {
102
- vi.mocked(existsSync).mockReturnValue(true);
103
- // Even if PID is alive
104
- vi.mocked(readFileSync).mockReturnValue(JSON.stringify({
105
- pid: 9999
106
- }));
107
- vi.mocked(isPidAlive).mockReturnValue(true);
108
-
109
- await LockManager.acquireLock(command, target, options, mockContext, true); // force = true
110
-
111
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
112
- expect(fs.writeFile).toHaveBeenCalled();
113
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn', message: expect.stringContaining('Force mode enabled') }));
114
- });
115
-
116
- it('should handle race condition (EEXIST)', async () => {
117
- vi.mocked(existsSync).mockReturnValue(false);
118
- vi.mocked(fs.writeFile).mockRejectedValue({ code: 'EEXIST' });
119
-
120
- await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
121
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Race condition') }));
122
- });
123
-
124
- it('should release lock on exit', async () => {
125
- // Acquire first (existsSync returns false by default from beforeEach)
126
- await LockManager.acquireLock(command, target, options, mockContext);
127
-
128
- // Simulate file exists for release
129
- vi.mocked(existsSync).mockReturnValue(true);
130
-
131
- // Simulate release
132
- LockManager.releaseLock();
133
-
134
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
135
- });
136
-
137
- it('should register signal handlers and cleanup on SIGINT', async () => {
138
- const processOnSpy = vi.spyOn(process, 'on');
139
- await LockManager.acquireLock(command, target, options, mockContext);
140
-
141
- // Find the handler
142
- const sigintCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGINT');
143
- expect(sigintCall).toBeDefined();
144
- const handler = sigintCall![1] as () => void;
145
-
146
- // Trigger handler
147
- vi.mocked(existsSync).mockReturnValue(true); // Simulate file still exists
148
- try {
149
- handler();
150
- } catch (e: any) {
151
- // Expect process.exit(130) which throws error in our mock
152
- expect(e.message).toBe('Process exit 130');
153
- }
154
-
155
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
156
- });
157
-
158
- it('should register signal handlers and cleanup on SIGTERM', async () => {
159
- const processOnSpy = vi.spyOn(process, 'on');
160
- await LockManager.acquireLock(command, target, options, mockContext);
161
-
162
- // Find the handler
163
- const sigtermCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGTERM');
164
- expect(sigtermCall).toBeDefined();
165
- const handler = sigtermCall![1] as () => void;
166
-
167
- // Trigger handler
168
- vi.mocked(existsSync).mockReturnValue(true);
169
- try {
170
- handler();
171
- } catch (e: any) {
172
- expect(e.message).toBe('Process exit 143');
173
- }
174
-
175
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
176
- });
177
-
178
- it('should register signal handlers and cleanup on uncaughtException', async () => {
179
- const processOnSpy = vi.spyOn(process, 'on');
180
- await LockManager.acquireLock(command, target, options, mockContext);
181
-
182
- // Find the handler
183
- const uncaughtExceptionCall = processOnSpy.mock.calls.find(call => call[0] === 'uncaughtException');
184
- expect(uncaughtExceptionCall).toBeDefined();
185
- const handler = uncaughtExceptionCall![1] as (err: Error) => void;
186
-
187
- // Trigger handler
188
- vi.mocked(existsSync).mockReturnValue(true);
189
- try {
190
- handler(new Error('Test error'));
191
- } catch (e: any) {
192
- expect(e.message).toBe('Process exit 1');
193
- }
194
-
195
- expect(unlinkSync).toHaveBeenCalledWith(lockPath);
196
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Uncaught Exception'), error: expect.any(Error) }));
197
- });
198
- });
@@ -1,196 +0,0 @@
1
- import { Graph } from '../src/graph/graph.js';
2
- import { calculateMetrics } from '../src/graph/metrics.js';
3
- import { test, expect } from 'vitest';
4
-
5
- test('graph metrics basic', () => {
6
- const g = new Graph();
7
-
8
- // Structure:
9
- // A -> B
10
- // A -> C
11
- // B -> C
12
- // C -> A
13
-
14
- g.addNode('A', 0, 200);
15
- g.addNode('B', 1, 200);
16
- g.addNode('C', 1, 200);
17
-
18
- g.addEdge('A', 'B');
19
- g.addEdge('A', 'C');
20
- g.addEdge('B', 'C');
21
- g.addEdge('C', 'A');
22
-
23
- const metrics = calculateMetrics(g, 5);
24
-
25
- expect(metrics.totalPages).toBe(3);
26
- expect(metrics.totalEdges).toBe(4);
27
-
28
- // Check degrees on nodes directly
29
- const nodeA = g.nodes.get('A');
30
- expect(nodeA?.inLinks).toBe(1);
31
- expect(nodeA?.outLinks).toBe(2);
32
-
33
- const nodeC = g.nodes.get('C');
34
- expect(nodeC?.inLinks).toBe(2);
35
- expect(nodeC?.outLinks).toBe(1);
36
-
37
- expect(metrics.averageOutDegree).toBeCloseTo(4/3);
38
-
39
- // Top authority should be C with 2 in-links, authority = 1
40
- expect(metrics.topAuthorityPages[0].url).toBe('C');
41
- expect(metrics.topAuthorityPages[0].authority).toBeCloseTo(1);
42
-
43
- // Max depth found
44
- expect(metrics.maxDepthFound).toBe(1);
45
-
46
- // Orphan pages (none)
47
- expect(metrics.orphanPages).toEqual([]);
48
- });
49
-
50
- test('orphan pages', () => {
51
- const g = new Graph();
52
- g.addNode('Root', 0, 200);
53
- g.addNode('Orphan', 1, 200);
54
- // Orphan is at depth 1 but no incoming edges recorded (maybe missed or filtered)
55
-
56
- const metrics = calculateMetrics(g, 5);
57
- expect(metrics.orphanPages).toContain('Orphan');
58
- expect(metrics.orphanPages).not.toContain('Root');
59
- });
60
- test('metrics v2 calculations', () => {
61
- const g = new Graph();
62
-
63
- // Root (depth 0, in=0, out=2)
64
- g.addNode('root', 0, 200);
65
-
66
- // A (depth 1, in=1, out=1)
67
- g.addNode('A', 1, 200);
68
- g.addEdge('root', 'A');
69
-
70
- // B (depth 1, in=1, out=0)
71
- g.addNode('B', 1, 200);
72
- g.addEdge('root', 'B');
73
-
74
- // C (depth 2, in=1, out=0)
75
- g.addNode('C', 2, 200);
76
- g.addEdge('A', 'C');
77
-
78
- // Orphan (depth 1, in=0) - e.g. added but no edge to it?
79
- // If it's in graph with depth > 0 and inLinks=0, it's an orphan.
80
- g.addNode('orphan', 1, 200);
81
-
82
- // Near Orphan (depth 3, in=1)
83
- g.addNode('D', 2, 200);
84
- g.addNode('nearOrphan', 3, 200);
85
- g.addEdge('C', 'D'); // C->D
86
- g.addEdge('D', 'nearOrphan'); // D->nearOrphan
87
-
88
- // Deep page (depth 4)
89
- g.addNode('deep', 4, 200);
90
- g.addEdge('nearOrphan', 'deep');
91
-
92
- // Nodes: root(0), A(1), B(1), C(2), orphan(1), D(2), nearOrphan(3), deep(4)
93
- // Total pages: 8
94
-
95
- // Edges: root->A, root->B, A->C, C->D, D->nearOrphan, nearOrphan->deep
96
- // Total edges: 6
97
-
98
- // InLinks:
99
- // root: 0
100
- // A: 1
101
- // B: 1
102
- // C: 1
103
- // orphan: 0
104
- // D: 1
105
- // nearOrphan: 1
106
- // deep: 1
107
-
108
- // Max InLinks = 1.
109
- // Authority Score = log(1 + in) / log(1 + maxIn)
110
- // If maxIn = 1, log(2).
111
- // For A: log(2)/log(2) = 1.
112
- // For root: log(1)/log(2) = 0.
113
-
114
- // Let's make maxIn > 1 to test better.
115
- g.addNode('popular', 1, 200);
116
- g.addEdge('root', 'popular');
117
- g.addEdge('A', 'popular');
118
- // popular inLinks = 2. MaxIn = 2.
119
- // Authority popular = log(3)/log(3) = 1.
120
- // Authority A = log(2)/log(3) approx 0.63
121
-
122
- const metrics = calculateMetrics(g, 10); // maxDepth arg (not used for calculation logic of deepPages which is hardcoded >=4 per prompt?)
123
- // Prompt says "deepPages: depth >= 4".
124
- // Existing calculateMetrics takes maxDepth arg.
125
- // Existing: deepPages = nodes.filter(n => n.depth >= maxDepth)
126
- // New requirement: deepPages: depth >= 4.
127
- // I should probably ignore the argument or update the requirement interpretation.
128
- // "deepPages: depth >= 4" implies fixed threshold.
129
-
130
- // Orphan pages: inLinks === 0 && depth > 0
131
- expect(metrics.orphanPages).toContain('orphan');
132
- expect(metrics.orphanPages).not.toContain('root'); // depth 0
133
-
134
- // Near orphans: inLinks === 1 && depth >= 3
135
- expect(metrics.nearOrphans).toContain('nearOrphan'); // depth 3, in 1
136
- expect(metrics.nearOrphans).toContain('deep'); // depth 4, in 1 (from nearOrphan)
137
- expect(metrics.nearOrphans).not.toContain('D'); // depth 2
138
-
139
- // Deep pages: depth >= 4
140
- expect(metrics.deepPages).toContain('deep');
141
- expect(metrics.deepPages).not.toContain('nearOrphan');
142
-
143
- // Crawl Efficiency Score: 1 - (deepPagesCount / totalPages)
144
- // Total: 9 nodes (root, A, B, C, orphan, D, nearOrphan, deep, popular)
145
- // Deep: 1 (deep)
146
- // Score: 1 - 1/9 = 8/9 = 0.888...
147
- expect(metrics.crawlEfficiencyScore).toBeCloseTo(8/9);
148
-
149
- // Average Depth: sum(depth) / totalPages
150
- // Depths: 0, 1, 1, 2, 1, 2, 3, 4, 1
151
- // Sum: 15
152
- // Avg: 15/9 = 1.666...
153
- expect(metrics.averageDepth).toBeCloseTo(15/9);
154
-
155
- // Structural Entropy
156
- // OutDegrees:
157
- // root: 3 (A, B, popular)
158
- // A: 2 (C, popular)
159
- // B: 0
160
- // C: 1 (D)
161
- // orphan: 0
162
- // D: 1 (nearOrphan)
163
- // nearOrphan: 1 (deep)
164
- // deep: 0
165
- // popular: 0
166
-
167
- // Distribution:
168
- // 0: 4 nodes (B, orphan, deep, popular)
169
- // 1: 3 nodes (C, D, nearOrphan)
170
- // 2: 1 node (A)
171
- // 3: 1 node (root)
172
-
173
- // P(0) = 4/9
174
- // P(1) = 3/9
175
- // P(2) = 1/9
176
- // P(3) = 1/9
177
-
178
- // Entropy = - (4/9 log2(4/9) + 3/9 log2(3/9) + 1/9 log2(1/9) + 1/9 log2(1/9))
179
- // = - (0.444 * -1.17 + 0.333 * -1.58 + 0.111 * -3.17 + 0.111 * -3.17)
180
- // approx 1.75
181
-
182
- // Let's compute exact expected value
183
- const p0 = 4/9;
184
- const p1 = 3/9;
185
- const p2 = 1/9;
186
- const p3 = 1/9;
187
- const entropy = - (p0 * Math.log2(p0) + p1 * Math.log2(p1) + p2 * Math.log2(p2) + p3 * Math.log2(p3));
188
-
189
- expect(metrics.structuralEntropy).toBeCloseTo(entropy);
190
-
191
- // Limit Reached
192
- expect(metrics.limitReached).toBe(false);
193
- g.limitReached = true;
194
- const metrics2 = calculateMetrics(g, 10);
195
- expect(metrics2.limitReached).toBe(true);
196
- });
@@ -1,88 +0,0 @@
1
- import { normalizeUrl } from '../src/crawler/normalize.js';
2
- import { test, expect } from 'vitest';
3
-
4
- test('normalizeUrl', () => {
5
- expect(normalizeUrl('https://Example.com/Foo/', '')).toBe('https://example.com/Foo');
6
- expect(normalizeUrl('http://example.com:80/bar', '')).toBe('http://example.com/bar');
7
- expect(normalizeUrl('https://example.com/baz#frag', '')).toBe('https://example.com/baz');
8
- expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: true })).toBe('https://example.com/qux');
9
- expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: false })).toBe('https://example.com/qux?a=1');
10
- expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
11
- });
12
-
13
- test('normalizeUrl: absolute resolution', () => {
14
- expect(normalizeUrl('/foo', 'https://example.com')).toBe('https://example.com/foo');
15
- expect(normalizeUrl('bar', 'https://example.com/baz/')).toBe('https://example.com/baz/bar');
16
- expect(normalizeUrl('//other.com/foo', 'https://example.com')).toBe('https://other.com/foo');
17
- });
18
-
19
- test('normalizeUrl: only http/https', () => {
20
- expect(normalizeUrl('ftp://example.com/file', 'https://example.com')).toBeNull();
21
- expect(normalizeUrl('mailto:user@example.com', 'https://example.com')).toBeNull();
22
- expect(normalizeUrl('javascript:alert(1)', 'https://example.com')).toBeNull();
23
- });
24
-
25
- test('normalizeUrl: lowercase hostname', () => {
26
- expect(normalizeUrl('https://EXAMPLE.com/foo', '')).toBe('https://example.com/foo');
27
- });
28
-
29
- test('normalizeUrl: remove default ports', () => {
30
- expect(normalizeUrl('http://example.com:80/foo', '')).toBe('http://example.com/foo');
31
- expect(normalizeUrl('https://example.com:443/foo', '')).toBe('https://example.com/foo');
32
- expect(normalizeUrl('http://example.com:8080/foo', '')).toBe('http://example.com:8080/foo');
33
- });
34
-
35
- test('normalizeUrl: remove hash fragments', () => {
36
- expect(normalizeUrl('https://example.com/foo#bar', '')).toBe('https://example.com/foo');
37
- });
38
-
39
- test('normalizeUrl: strip query', () => {
40
- expect(normalizeUrl('https://example.com/foo?a=1&b=2', '', { stripQuery: true })).toBe('https://example.com/foo');
41
- });
42
-
43
- test('normalizeUrl: filter tracking params', () => {
44
- const url = 'https://example.com/foo?utm_source=google&utm_medium=cpc&a=1&fbclid=123';
45
- expect(normalizeUrl(url, '', { stripQuery: false })).toBe('https://example.com/foo?a=1');
46
-
47
- const url2 = 'https://example.com/foo?gclid=abc&msclkid=def';
48
- expect(normalizeUrl(url2, '', { stripQuery: false })).toBe('https://example.com/foo');
49
- });
50
-
51
- test('normalizeUrl: trailing slash', () => {
52
- expect(normalizeUrl('https://example.com/foo/', '')).toBe('https://example.com/foo');
53
- expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
54
- });
55
-
56
- test('normalizeUrl: collapse duplicate slashes', () => {
57
- expect(normalizeUrl('https://example.com/foo//bar', '')).toBe('https://example.com/foo/bar');
58
- expect(normalizeUrl('https://example.com//foo///bar', '')).toBe('https://example.com/foo/bar');
59
- });
60
-
61
- test('normalizeUrl: skip non-HTML assets', () => {
62
- expect(normalizeUrl('https://example.com/file.pdf', '')).toBeNull();
63
- expect(normalizeUrl('https://example.com/image.jpg', '')).toBeNull();
64
- expect(normalizeUrl('https://example.com/image.png', '')).toBeNull();
65
- expect(normalizeUrl('https://example.com/image.svg', '')).toBeNull();
66
- expect(normalizeUrl('https://example.com/image.webp', '')).toBeNull();
67
- expect(normalizeUrl('https://example.com/image.gif', '')).toBeNull();
68
- expect(normalizeUrl('https://example.com/archive.zip', '')).toBeNull();
69
- expect(normalizeUrl('https://example.com/data.xml', '')).toBeNull();
70
- expect(normalizeUrl('https://example.com/data.json', '')).toBeNull();
71
- expect(normalizeUrl('https://example.com/video.mp4', '')).toBeNull();
72
-
73
- // HTML extensions should pass (or no extension)
74
- expect(normalizeUrl('https://example.com/page.html', '')).toBe('https://example.com/page.html');
75
- expect(normalizeUrl('https://example.com/page.htm', '')).toBe('https://example.com/page.htm');
76
- expect(normalizeUrl('https://example.com/page', '')).toBe('https://example.com/page');
77
- });
78
-
79
- test('normalizeUrl: invalid URL', () => {
80
- expect(normalizeUrl('/foo', '')).toBeNull();
81
- expect(normalizeUrl('invalid-url', '')).toBeNull();
82
- expect(normalizeUrl('/foo', 'invalid-base')).toBeNull();
83
- });
84
-
85
- test('normalizeUrl: return format', () => {
86
- const res = normalizeUrl('https://example.com/foo?a=1', '');
87
- expect(res).toBe('https://example.com/foo?a=1');
88
- });
@@ -1,160 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { annotateOrphans, calculateOrphanSeverity, mapImpactLevel, type CrawlNode, type CrawlEdge } from '../src/scoring/orphanSeverity.js';
3
-
4
- function baseNode(url: string, overrides: Partial<CrawlNode> = {}): CrawlNode {
5
- return {
6
- url,
7
- depth: 1,
8
- inLinks: 0,
9
- outLinks: 0,
10
- status: 200,
11
- ...overrides
12
- };
13
- }
14
-
15
- describe('orphan detection and severity scoring', () => {
16
- test('hard orphan detection and homepage exclusion', () => {
17
- const nodes: CrawlNode[] = [
18
- baseNode('https://example.com/', { depth: 0, inLinks: 0 }),
19
- baseNode('https://example.com/orphan', { inLinks: 0 })
20
- ];
21
- const edges: CrawlEdge[] = [];
22
-
23
- const result = annotateOrphans(nodes, edges, {
24
- enabled: true,
25
- severityEnabled: false,
26
- includeSoftOrphans: false,
27
- minInbound: 2,
28
- rootUrl: 'https://example.com/'
29
- });
30
-
31
- expect(result[0]).toMatchObject({ orphan: false });
32
- expect(result[1]).toMatchObject({ orphan: true, orphanType: 'hard' });
33
- });
34
-
35
- test('near orphan threshold override', () => {
36
- const nodes = [baseNode('https://example.com/near', { inLinks: 2 })];
37
- const edges: CrawlEdge[] = [];
38
-
39
- const resultDefault = annotateOrphans(nodes, edges, {
40
- enabled: true,
41
- severityEnabled: false,
42
- includeSoftOrphans: false,
43
- minInbound: 2
44
- });
45
- const resultStrict = annotateOrphans(nodes, edges, {
46
- enabled: true,
47
- severityEnabled: false,
48
- includeSoftOrphans: false,
49
- minInbound: 1
50
- });
51
-
52
- expect(resultDefault[0]).toMatchObject({ orphan: true, orphanType: 'near' });
53
- expect(resultStrict[0]).toMatchObject({ orphan: false });
54
- });
55
-
56
- test('soft orphan detection only when enabled and inbound only from low-value sources', () => {
57
- const nodes: CrawlNode[] = [
58
- baseNode('https://example.com/tag/seo', { pageType: 'tag', outLinks: 1 }),
59
- baseNode('https://example.com/list?page=2', { pageType: 'pagination', outLinks: 1 }),
60
- baseNode('https://example.com/target', { inLinks: 2 }),
61
- baseNode('https://example.com/normal', { outLinks: 1 })
62
- ];
63
-
64
- const edges: CrawlEdge[] = [
65
- { source: 'https://example.com/tag/seo', target: 'https://example.com/target' },
66
- { source: 'https://example.com/list?page=2', target: 'https://example.com/target' }
67
- ];
68
-
69
- const withSoft = annotateOrphans(nodes, edges, {
70
- enabled: true,
71
- severityEnabled: false,
72
- includeSoftOrphans: true,
73
- minInbound: 1
74
- });
75
-
76
- const withoutSoft = annotateOrphans(nodes, edges, {
77
- enabled: true,
78
- severityEnabled: false,
79
- includeSoftOrphans: false,
80
- minInbound: 1
81
- });
82
-
83
- expect(withSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: true, orphanType: 'soft' });
84
- expect(withoutSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: false });
85
- });
86
-
87
- test('crawl-only orphan detection', () => {
88
- const nodes = [baseNode('https://example.com/sitemap-only', { inLinks: 0, discoveredViaSitemap: true })];
89
- const result = annotateOrphans(nodes, [], {
90
- enabled: true,
91
- severityEnabled: false,
92
- includeSoftOrphans: false,
93
- minInbound: 2
94
- });
95
-
96
- expect(result[0]).toMatchObject({ orphan: true, orphanType: 'crawl-only' });
97
- });
98
-
99
- test('severity calculation modifiers and score clamping', () => {
100
- const high = calculateOrphanSeverity('hard', baseNode('https://example.com/high', {
101
- inLinks: 0,
102
- wordCount: 1500,
103
- hasStructuredData: true,
104
- depth: 1,
105
- isProductOrCommercial: true
106
- }));
107
-
108
- const low = calculateOrphanSeverity('hard', baseNode('https://example.com/low', {
109
- inLinks: 0,
110
- wordCount: 120,
111
- noindex: true,
112
- duplicateContent: true,
113
- pageType: 'archive'
114
- }));
115
-
116
- expect(high).toBe(100);
117
- expect(low).toBe(80);
118
- });
119
-
120
- test('impact level mapping', () => {
121
- expect(mapImpactLevel(0)).toBe('low');
122
- expect(mapImpactLevel(39)).toBe('low');
123
- expect(mapImpactLevel(40)).toBe('medium');
124
- expect(mapImpactLevel(69)).toBe('medium');
125
- expect(mapImpactLevel(70)).toBe('high');
126
- expect(mapImpactLevel(89)).toBe('high');
127
- expect(mapImpactLevel(90)).toBe('critical');
128
- expect(mapImpactLevel(100)).toBe('critical');
129
- });
130
-
131
- test('canonical consolidation, robots exclusion, and deterministic JSON output snapshot', () => {
132
- const nodes: CrawlNode[] = [
133
- baseNode('https://example.com/canonical', { inLinks: 0 }),
134
- baseNode('https://example.com/variant?a=1', { canonicalUrl: 'https://example.com/canonical', inLinks: 1 }),
135
- baseNode('https://example.com/blocked', { inLinks: 0, robotsExcluded: true }),
136
- baseNode('https://example.com/redirect-target', { inLinks: 1 })
137
- ];
138
-
139
- const edges: CrawlEdge[] = [
140
- { source: 'https://example.com/redirect-source', target: 'https://example.com/redirect-target' }
141
- ];
142
-
143
- const options = {
144
- enabled: true,
145
- severityEnabled: true,
146
- includeSoftOrphans: true,
147
- minInbound: 2
148
- };
149
-
150
- const first = annotateOrphans(nodes, edges, options);
151
- const second = annotateOrphans(nodes, edges, options);
152
-
153
- expect(first).toEqual(second);
154
- expect(first.find((n) => n.url.endsWith('/canonical'))).toMatchObject({ orphan: true, orphanType: 'near' });
155
- expect(first.find((n) => n.url.endsWith('/blocked'))).toMatchObject({ orphan: false });
156
-
157
- const normalized = JSON.stringify(first, null, 2).replace(/\r\n/g, '\n');
158
- expect(normalized).toMatchSnapshot();
159
- });
160
- });