@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,364 +0,0 @@
1
- import { test, expect, beforeEach, afterEach, vi } from 'vitest';
2
- import { crawl } from '../src/crawler/crawl.js';
3
- import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
4
- import { closeDb } from '../src/db/index.js';
5
- import { MockAgent, setGlobalDispatcher } from 'undici';
6
- import { IPGuard } from '../src/core/security/ipGuard.js';
7
- import { EngineContext } from '../src/events.js';
8
-
9
- let mockAgent: MockAgent;
10
- const mockContext: EngineContext = { emit: vi.fn() };
11
-
12
- beforeEach(() => {
13
- process.env.CRAWLITH_DB_PATH = ':memory:';
14
- mockAgent = new MockAgent();
15
- mockAgent.disableNetConnect();
16
- setGlobalDispatcher(mockAgent);
17
-
18
- // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
19
- vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
20
- });
21
-
22
- afterEach(() => {
23
- closeDb();
24
- });
25
-
26
- test('crawler should crawl and build graph', async () => {
27
- const client = mockAgent.get('https://example.com');
28
-
29
- // Root
30
- client.intercept({
31
- path: '/',
32
- method: 'GET'
33
- }).reply(200, `
34
- <html><body>
35
- <a href="/page1">Page 1</a>
36
- <a href="/page2">Page 2</a>
37
- </body></html>
38
- `, {
39
- headers: { 'content-type': 'text/html' }
40
- });
41
-
42
- // Page 1
43
- client.intercept({
44
- path: '/page1',
45
- method: 'GET'
46
- }).reply(200, `
47
- <html><body>
48
- <a href="/page2">Page 2</a>
49
- </body></html>
50
- `, {
51
- headers: { 'content-type': 'text/html' }
52
- });
53
-
54
- // Page 2
55
- client.intercept({
56
- path: '/page2',
57
- method: 'GET'
58
- }).reply(200, `
59
- <html><body>
60
- <a href="/">Home</a>
61
- </body></html>
62
- `, {
63
- headers: { 'content-type': 'text/html' }
64
- });
65
-
66
- // Robots.txt
67
- client.intercept({
68
- path: '/robots.txt',
69
- method: 'GET'
70
- }).reply(404, 'Not Found');
71
-
72
- const snapshotId = await crawl('https://example.com', {
73
- limit: 10,
74
- depth: 2,
75
- ignoreRobots: false,
76
- rate: 1000
77
- }, mockContext);
78
- const graph = loadGraphFromSnapshot(snapshotId);
79
-
80
- const nodes = graph.getNodes();
81
- expect(nodes.length).toBe(3);
82
-
83
- const root = graph.nodes.get('https://example.com/');
84
- expect(root).toBeDefined();
85
- expect(root?.depth).toBe(0);
86
- expect(root?.outLinks).toBe(2);
87
-
88
- const page1 = graph.nodes.get('https://example.com/page1');
89
- expect(page1).toBeDefined();
90
- expect(page1?.depth).toBe(1);
91
- expect(page1?.inLinks).toBe(1);
92
-
93
- const page2 = graph.nodes.get('https://example.com/page2');
94
- expect(page2).toBeDefined();
95
- expect(page2?.inLinks).toBe(2);
96
- });
97
-
98
- test('hard page limit', async () => {
99
- const client = mockAgent.get('https://limit.com');
100
-
101
- // Robots
102
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
103
-
104
- // Root links to 1, 2, 3
105
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
106
- <html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
107
- `, { headers: { 'content-type': 'text/html' } });
108
-
109
- // 1, 2, 3 return html
110
- client.intercept({ path: '/1', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
111
- client.intercept({ path: '/2', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
112
- client.intercept({ path: '/3', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
113
-
114
- const snapshotId = await crawl('https://limit.com', {
115
- limit: 2, // root + 1 page
116
- depth: 5,
117
- ignoreRobots: true,
118
- rate: 1000
119
- }, mockContext);
120
- const graph = loadGraphFromSnapshot(snapshotId);
121
-
122
- // Should have visited root + 1 other page (total 2 nodes with status > 0)
123
- const crawledNodes = graph.getNodes().filter(n => n.status > 0);
124
- expect(crawledNodes.length).toBeLessThanOrEqual(2);
125
- });
126
-
127
- test('hard depth cap', async () => {
128
- const client = mockAgent.get('https://depth.com');
129
-
130
- // Robots
131
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
132
-
133
- // Chain of 12 pages
134
- for (let i = 0; i < 12; i++) {
135
- const path = i === 0 ? '/' : `/p${i}`;
136
- const nextPath = `/p${i + 1}`;
137
- client.intercept({ path, method: 'GET' }).reply(200, `
138
- <html><a href="${nextPath}">Next</a></html>
139
- `, { headers: { 'content-type': 'text/html' } });
140
- }
141
-
142
- const snapshotId = await crawl('https://depth.com', {
143
- limit: 100,
144
- depth: 20, // requested 20, but internal hard cap is 10
145
- ignoreRobots: true,
146
- rate: 1000
147
- }, mockContext);
148
- const graph = loadGraphFromSnapshot(snapshotId);
149
-
150
- const crawledNodes = graph.getNodes().filter(n => n.status > 0);
151
- const maxCrawledDepth = crawledNodes.reduce((max, n) => Math.max(max, n.depth), 0);
152
-
153
- expect(maxCrawledDepth).toBeLessThanOrEqual(10);
154
- });
155
-
156
- test('parameter explosion control', async () => {
157
- const client = mockAgent.get('https://params.com');
158
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
159
-
160
- // Root links to many variations
161
- let links = '';
162
- for (let i = 0; i < 10; i++) {
163
- links += `<a href="/search?q=${i}">q${i}</a>`;
164
- }
165
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
166
- <html>${links}</html>
167
- `, { headers: { 'content-type': 'text/html' } });
168
-
169
- // Intercept all variations
170
- for (let i = 0; i < 40; i++) {
171
- client.intercept({ path: `/search?q=${i}`, method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
172
- }
173
-
174
- const snapshotId = await crawl('https://params.com', {
175
- limit: 100,
176
- depth: 5,
177
- ignoreRobots: true,
178
- stripQuery: false,
179
- detectTraps: true,
180
- rate: 1000
181
- }, mockContext);
182
- const graph = loadGraphFromSnapshot(snapshotId);
183
-
184
- // Should only crawl 5 variations + root
185
- const nodes = graph.getNodes();
186
- // Filter nodes that match /search pathname
187
- const searchNodes = nodes.filter(n => n.url.includes('/search') && n.status > 0);
188
-
189
- expect(searchNodes.length).toBeLessThanOrEqual(31);
190
- });
191
-
192
- test('redirect safety', async () => {
193
- const client = mockAgent.get('https://redirect.com');
194
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
195
-
196
- // Root -> /redir1
197
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
198
- <html><a href="/redir1">Go</a></html>
199
- `, { headers: { 'content-type': 'text/html' } });
200
-
201
- // /redir1 -> 301 -> /dest
202
- client.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
203
- headers: { 'location': '/dest' }
204
- });
205
-
206
- // /dest -> 200
207
- client.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', { headers: { 'content-type': 'text/html' } });
208
-
209
- const snapshotId = await crawl('https://redirect.com', {
210
- limit: 10,
211
- depth: 5,
212
- ignoreRobots: true,
213
- rate: 1000
214
- }, mockContext);
215
- const graph = loadGraphFromSnapshot(snapshotId);
216
-
217
- const destNode = graph.nodes.get('https://redirect.com/dest');
218
- expect(destNode).toBeDefined();
219
- expect(destNode?.status).toBe(200);
220
-
221
- // Redirect loop: A -> B -> A
222
- const clientLoop = mockAgent.get('https://loop.com');
223
- clientLoop.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
224
- clientLoop.intercept({ path: '/', method: 'GET' }).reply(200, `
225
- <html><a href="/a">Loop</a></html>
226
- `, { headers: { 'content-type': 'text/html' } });
227
-
228
- clientLoop.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
229
- clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
230
- // We might mock /a again if it retries, but it shouldn't infinitely loop
231
-
232
- const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
233
- const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
234
- // It should eventually stop
235
- expect(graphLoop.getNodes().length).toBeGreaterThan(0);
236
- });
237
-
238
- test('mime check', async () => {
239
- const client = mockAgent.get('https://mime.com');
240
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
241
-
242
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
243
- <html><a href="/image.png">Img</a></html>
244
- `, { headers: { 'content-type': 'text/html' } });
245
-
246
- client.intercept({ path: '/data', method: 'GET' }).reply(200, `
247
- <html><a href="/hidden">Hidden</a></html>
248
- `, { headers: { 'content-type': 'application/json' } });
249
-
250
- // Root links to /data
251
- client.intercept({ path: '/start', method: 'GET' }).reply(200, `
252
- <html><a href="/data">Data</a></html>
253
- `, { headers: { 'content-type': 'text/html' } });
254
-
255
- const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
256
- const graph = loadGraphFromSnapshot(snapshotId);
257
-
258
- // /data should be in graph
259
- const dataNode = graph.nodes.get('https://mime.com/data');
260
- expect(dataNode).toBeDefined();
261
- // But we should NOT have parsed it, so /hidden should NOT be in graph
262
- const hiddenNode = graph.nodes.get('https://mime.com/hidden');
263
- expect(hiddenNode).toBeUndefined();
264
- });
265
-
266
- test('self-link guard', async () => {
267
- const client = mockAgent.get('https://self.com');
268
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
269
-
270
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
271
- <html><a href="/">Self</a><a href="/other">Other</a></html>
272
- `, { headers: { 'content-type': 'text/html' } });
273
-
274
- client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
275
-
276
- const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
277
- const graph = loadGraphFromSnapshot(snapshotId);
278
-
279
- const edges = graph.getEdges();
280
- const selfEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/');
281
- expect(selfEdge).toBeUndefined();
282
-
283
- const otherEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/other');
284
- expect(otherEdge).toBeDefined();
285
- });
286
-
287
- test('limit warning', async () => {
288
- const client = mockAgent.get('https://warn.com');
289
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
290
-
291
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
292
- <html><a href="/1">1</a><a href="/2">2</a></html>
293
- `, { headers: { 'content-type': 'text/html' } });
294
-
295
- client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
296
-
297
- const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
298
- const graph = loadGraphFromSnapshot(snapshotId);
299
-
300
- expect(graph.limitReached).toBe(true);
301
- });
302
-
303
- test('seeds from sitemap', async () => {
304
- const client = mockAgent.get('https://sitemap-seed.com');
305
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
306
-
307
- // Sitemap
308
- client.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
309
- <urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
310
- `);
311
-
312
- // Root
313
- client.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', { headers: { 'content-type': 'text/html' } });
314
-
315
- // Page 1
316
- client.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', { headers: { 'content-type': 'text/html' } });
317
-
318
- const snapshotId = await crawl('https://sitemap-seed.com', {
319
- limit: 10,
320
- depth: 5,
321
- ignoreRobots: true,
322
- sitemap: 'true',
323
- rate: 1000
324
- }, mockContext);
325
- const graph = loadGraphFromSnapshot(snapshotId);
326
-
327
- const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
328
- expect(page1).toBeDefined();
329
- expect(page1?.status).toBe(200);
330
- });
331
-
332
- test('incremental crawl uses etags', async () => {
333
- const client = mockAgent.get('https://incremental.com');
334
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
335
-
336
- // First crawl setup
337
- client.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
338
- headers: { 'content-type': 'text/html', 'etag': '"v1"' }
339
- });
340
-
341
- const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
342
- const graph1 = loadGraphFromSnapshot(snapshotId1);
343
- const node1 = graph1.nodes.get('https://incremental.com/');
344
- expect(node1?.etag).toBe('"v1"');
345
-
346
- // Second crawl setup
347
- client.intercept({
348
- path: '/',
349
- method: 'GET',
350
- headers: { 'If-None-Match': '"v1"' }
351
- }).reply(304, '', { headers: { 'etag': '"v1"' } });
352
-
353
- const snapshotId2 = await crawl('https://incremental.com', {
354
- limit: 10,
355
- depth: 1,
356
- ignoreRobots: true,
357
- previousGraph: graph1,
358
- rate: 1000
359
- }, mockContext);
360
- const graph2 = loadGraphFromSnapshot(snapshotId2);
361
-
362
- const node2 = graph2.nodes.get('https://incremental.com/');
363
- expect(node2?.incrementalStatus).toBe('unchanged');
364
- });
@@ -1,134 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
- import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
3
- import fs from 'node:fs';
4
- import os from 'node:os';
5
- import path from 'node:path';
6
-
7
- vi.mock('node:fs');
8
- vi.mock('node:os');
9
- vi.mock('better-sqlite3', () => {
10
- return {
11
- default: vi.fn(function () {
12
- return {
13
- pragma: vi.fn().mockReturnValue('ok'),
14
- prepare: vi.fn().mockReturnValue({
15
- run: vi.fn(),
16
- get: vi.fn(),
17
- iterate: vi.fn(),
18
- all: vi.fn()
19
- }),
20
- exec: vi.fn(),
21
- close: vi.fn(),
22
- transaction: vi.fn((fn) => fn),
23
- };
24
- }),
25
- };
26
- });
27
- vi.mock('../../src/db/schema.js', () => ({
28
- initSchema: vi.fn(),
29
- }));
30
-
31
- describe('DB Index', () => {
32
- const originalEnv = process.env;
33
-
34
- beforeEach(() => {
35
- vi.resetAllMocks();
36
- closeDb();
37
- process.env = { ...originalEnv };
38
- // Default mock behaviors
39
- vi.mocked(os.homedir).mockReturnValue('/home/user');
40
- vi.mocked(fs.existsSync).mockReturnValue(false);
41
- vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
42
- vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
43
- });
44
-
45
- afterEach(() => {
46
- process.env = originalEnv;
47
- closeDb();
48
- });
49
-
50
- describe('getDbPath', () => {
51
- it('should return :memory: in test environment', () => {
52
- process.env.NODE_ENV = 'test';
53
- expect(getDbPath()).toBe(':memory:');
54
- });
55
-
56
- it('should return custom path if CRAWLITH_DB_PATH is set', () => {
57
- process.env.NODE_ENV = 'production';
58
- process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
59
- expect(getDbPath()).toBe('/custom/path/db.sqlite');
60
- });
61
-
62
- it('should return default path in home dir if no env var', () => {
63
- process.env.NODE_ENV = 'production';
64
- delete process.env.CRAWLITH_DB_PATH;
65
-
66
- const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
67
- expect(getDbPath()).toBe(expectedPath);
68
-
69
- expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
70
- expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
71
- });
72
-
73
- it('should not create dir if it exists', () => {
74
- process.env.NODE_ENV = 'production';
75
- vi.mocked(fs.existsSync).mockReturnValue(true);
76
-
77
- getDbPath();
78
-
79
- expect(fs.mkdirSync).not.toHaveBeenCalled();
80
- });
81
- });
82
-
83
- describe('getDb', () => {
84
- it('should create a new database instance', () => {
85
- process.env.NODE_ENV = 'production';
86
- const db = getDb();
87
- expect(db).toBeDefined();
88
- // Check if pragma was called
89
- expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
90
- });
91
-
92
- it('should return existing instance if called twice', () => {
93
- process.env.NODE_ENV = 'production';
94
- const db1 = getDb();
95
- const db2 = getDb();
96
- expect(db1).toBe(db2);
97
- });
98
-
99
- it('should handle permission errors gracefully', () => {
100
- process.env.NODE_ENV = 'production';
101
- // Avoid getDbPath throwing
102
- vi.mocked(fs.existsSync).mockReturnValue(true);
103
-
104
- vi.mocked(fs.chmodSync).mockImplementation((path) => {
105
- if (path.toString().endsWith('crawlith.db')) {
106
- throw new Error('EPERM');
107
- }
108
- });
109
-
110
- expect(() => getDb()).not.toThrow();
111
- });
112
-
113
- it('should warn if integrity check fails', async () => {
114
- const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
115
- process.env.NODE_ENV = 'production';
116
- vi.mocked(fs.existsSync).mockReturnValue(true);
117
-
118
- const MockDatabase = (await import('better-sqlite3')).default;
119
- vi.mocked(MockDatabase).mockImplementationOnce(function() {
120
- return {
121
- pragma: vi.fn().mockReturnValue('corrupt'),
122
- prepare: vi.fn(),
123
- exec: vi.fn(),
124
- close: vi.fn(),
125
- transaction: vi.fn(),
126
- } as any;
127
- });
128
-
129
- getDb();
130
-
131
- expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
132
- });
133
- });
134
- });
@@ -1,115 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
- import Database from 'better-sqlite3';
3
- import { PageRepository } from '../../src/db/repositories/PageRepository.js';
4
- import { initSchema } from '../../src/db/schema.js';
5
-
6
- describe('PageRepository', () => {
7
- let db: Database.Database;
8
- let repo: PageRepository;
9
-
10
- beforeEach(() => {
11
- db = new Database(':memory:');
12
- initSchema(db);
13
- repo = new PageRepository(db);
14
-
15
- // Seed required tables (sites, snapshots)
16
- db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
17
- db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
18
- });
19
-
20
- afterEach(() => {
21
- db.close();
22
- });
23
-
24
- it('should get pages by URLs in chunks', () => {
25
- const urls: string[] = [];
26
- const siteId = 1;
27
- const snapshotId = 1;
28
-
29
- // Create 1000 pages (chunk size is 900)
30
- const insertStmt = db.prepare(`
31
- INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
32
- VALUES (?, ?, ?, ?)
33
- `);
34
-
35
- const tx = db.transaction(() => {
36
- for (let i = 0; i < 1000; i++) {
37
- const url = `http://example.com/page${i}`;
38
- urls.push(url);
39
- insertStmt.run(siteId, url, snapshotId, snapshotId);
40
- }
41
- });
42
- tx();
43
-
44
- // Fetch pages
45
- const pages = repo.getPagesByUrls(siteId, urls);
46
-
47
- expect(pages).toHaveLength(1000);
48
- expect(pages[0].normalized_url).toBe('http://example.com/page0');
49
- expect(pages[999].normalized_url).toBe('http://example.com/page999');
50
- });
51
-
52
- it('should return empty array for empty URL list', () => {
53
- const pages = repo.getPagesByUrls(1, []);
54
- expect(pages).toEqual([]);
55
- });
56
-
57
- it('should iterate over pages by snapshot', () => {
58
- const siteId = 1;
59
- const snapshotId = 1;
60
- const insertStmt = db.prepare(`
61
- INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
62
- VALUES (?, ?, ?, ?)
63
- `);
64
-
65
- db.transaction(() => {
66
- insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
67
- insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
68
- insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
69
- })();
70
-
71
- const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
72
- const pages = Array.from(iterator);
73
-
74
- expect(pages).toHaveLength(3);
75
- expect(pages.map(p => p.normalized_url).sort()).toEqual([
76
- 'http://example.com/1',
77
- 'http://example.com/2',
78
- 'http://example.com/3'
79
- ]);
80
- });
81
-
82
- it('should upsert and get ID', () => {
83
- const pageData = {
84
- site_id: 1,
85
- normalized_url: 'http://example.com/new',
86
- last_seen_snapshot_id: 1,
87
- http_status: 200,
88
- };
89
-
90
- const id = repo.upsertAndGetId(pageData);
91
- expect(id).toBeGreaterThan(0);
92
-
93
- const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
94
- expect(sameId).toBe(id);
95
-
96
- const page = repo.getPage(1, 'http://example.com/new');
97
- expect(page?.http_status).toBe(404);
98
- });
99
-
100
- it('should get ID by URL', () => {
101
- const pageData = {
102
- site_id: 1,
103
- normalized_url: 'http://example.com/id-test',
104
- last_seen_snapshot_id: 1,
105
- };
106
- repo.upsertPage(pageData);
107
-
108
- const id = repo.getIdByUrl(1, 'http://example.com/id-test');
109
- expect(id).toBeDefined();
110
- expect(id).toBeGreaterThan(0);
111
-
112
- const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
113
- expect(missingId).toBeUndefined();
114
- });
115
- });