@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,358 @@
1
+ import { test, expect, beforeEach, afterEach } from 'vitest';
2
+ import { crawl } from '../src/crawler/crawl.js';
3
+ import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
4
+ import { closeDb } from '../src/db/index.js';
5
+ import { MockAgent, setGlobalDispatcher } from 'undici';
6
+
7
/** Mock HTTP dispatcher used by the tests in this file; replaced before every test. */
let mockAgent: MockAgent;

/**
 * Per-test setup: set CRAWLITH_DB_PATH to ':memory:' and install a fresh
 * MockAgent as undici's global dispatcher with real network access disabled,
 * so only explicitly intercepted requests can succeed.
 */
beforeEach(() => {
  process.env.CRAWLITH_DB_PATH = ':memory:';
  mockAgent = new MockAgent();
  mockAgent.disableNetConnect();
  setGlobalDispatcher(mockAgent);
});

/**
 * Per-test teardown: close the database and then the MockAgent.
 * The agent is closed in `finally` so its mock clients are released even if
 * closing the database throws; previously the agent was never closed and
 * every test left its mock clients open.
 */
afterEach(async () => {
  try {
    closeDb();
  } finally {
    await mockAgent.close();
  }
});
19
+
20
/**
 * Crawls a three-page mock site and checks the graph rebuilt from the
 * resulting snapshot: node count, depths, and in/out link counts.
 */
test('crawler should crawl and build graph', async () => {
  const site = mockAgent.get('https://example.com');
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  // Link structure: / -> /page1, /page2 ; /page1 -> /page2 ; /page2 -> /
  const pages: Array<[string, string]> = [
    ['/', `
    <html><body>
      <a href="/page1">Page 1</a>
      <a href="/page2">Page 2</a>
    </body></html>
  `],
    ['/page1', `
    <html><body>
      <a href="/page2">Page 2</a>
    </body></html>
  `],
    ['/page2', `
    <html><body>
      <a href="/">Home</a>
    </body></html>
  `],
  ];
  for (const [path, body] of pages) {
    site.intercept({ path, method: 'GET' }).reply(200, body, asHtml());
  }

  // ignoreRobots is false below, so robots.txt gets an explicit 404 reply.
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, 'Not Found');

  const snapshotId = await crawl('https://example.com', {
    limit: 10,
    depth: 2,
    ignoreRobots: false,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  expect(graph.getNodes().length).toBe(3);

  const rootNode = graph.nodes.get('https://example.com/');
  expect(rootNode).toBeDefined();
  expect(rootNode?.depth).toBe(0);
  expect(rootNode?.outLinks).toBe(2);

  const firstPage = graph.nodes.get('https://example.com/page1');
  expect(firstPage).toBeDefined();
  expect(firstPage?.depth).toBe(1);
  expect(firstPage?.inLinks).toBe(1);

  const secondPage = graph.nodes.get('https://example.com/page2');
  expect(secondPage).toBeDefined();
  expect(secondPage?.inLinks).toBe(2);
});
91
+
92
/**
 * With `limit: 2` on a root that links to three pages, no more than two
 * nodes may end up with a positive status in the stored graph.
 */
test('hard page limit', async () => {
  const site = mockAgent.get('https://limit.com');
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // The root page links to /1, /2 and /3.
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
  `, asHtml());

  // Each linked page answers with an empty HTML document.
  for (const path of ['/1', '/2', '/3']) {
    site.intercept({ path, method: 'GET' }).reply(200, '<html></html>', asHtml());
  }

  const snapshotId = await crawl('https://limit.com', {
    limit: 2, // root + 1 page
    depth: 5,
    ignoreRobots: true,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  // Count nodes with status > 0; at most two are allowed.
  const fetchedCount = graph.getNodes().filter((node) => node.status > 0).length;
  expect(fetchedCount).toBeLessThanOrEqual(2);
});
120
+
121
/**
 * Requests depth 20 on a 12-page linear chain and asserts that no node with
 * a positive status sits deeper than 10.
 */
test('hard depth cap', async () => {
  const site = mockAgent.get('https://depth.com');

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // Chain: / -> /p1 -> /p2 -> ... -> /p11 -> /p12 (the last target has no interceptor).
  const chainPaths = Array.from({ length: 12 }, (_, i) => (i === 0 ? '/' : `/p${i}`));
  chainPaths.forEach((path, i) => {
    const nextPath = `/p${i + 1}`;
    site.intercept({ path, method: 'GET' }).reply(200, `
      <html><a href="${nextPath}">Next</a></html>
    `, { headers: { 'content-type': 'text/html' } });
  });

  const snapshotId = await crawl('https://depth.com', {
    limit: 100,
    depth: 20, // larger than the asserted cap of 10
    ignoreRobots: true,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  let deepest = 0;
  for (const node of graph.getNodes()) {
    if (node.status > 0) {
      deepest = Math.max(deepest, node.depth);
    }
  }

  expect(deepest).toBeLessThanOrEqual(10);
});
149
+
150
/**
 * Crawls a root page linking to query-string variations of /search with
 * trap detection enabled, and bounds how many of them get a positive status.
 */
test('parameter explosion control', async () => {
  const site = mockAgent.get('https://params.com');
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // The root page links to /search?q=0 .. /search?q=9.
  const anchors = Array.from({ length: 10 }, (_, i) => `<a href="/search?q=${i}">q${i}</a>`).join('');
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html>${anchors}</html>
  `, asHtml());

  // Interceptors are registered for q=0 .. q=39.
  // NOTE(review): only q=0..9 are linked from the root, so the bound of 31
  // asserted below cannot be exceeded by this fixture — confirm intent.
  for (let q = 0; q < 40; q += 1) {
    site.intercept({ path: `/search?q=${q}`, method: 'GET' }).reply(200, '<html></html>', asHtml());
  }

  const snapshotId = await crawl('https://params.com', {
    limit: 100,
    depth: 5,
    ignoreRobots: true,
    stripQuery: false,
    detectTraps: true,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  // Nodes under /search that have a positive status.
  const fetchedSearchPages = graph
    .getNodes()
    .filter((node) => node.url.includes('/search') && node.status > 0);

  expect(fetchedSearchPages.length).toBeLessThanOrEqual(31);
});
185
+
186
/**
 * Two redirect scenarios:
 *  1. a single 301 hop whose destination must appear in the graph with status 200;
 *  2. a two-URL redirect cycle (/a -> /b -> /a) on which the crawl must still
 *     finish and produce a non-empty graph.
 */
test('redirect safety', async () => {
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });
  const crawlOptions = () => ({ limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });

  // Scenario 1: / links to /redir1, which 301-redirects to /dest.
  const site = mockAgent.get('https://redirect.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/redir1">Go</a></html>
  `, asHtml());
  site.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
    headers: { 'location': '/dest' },
  });
  site.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', asHtml());

  const snapshotId = await crawl('https://redirect.com', crawlOptions());
  const graph = loadGraphFromSnapshot(snapshotId);

  const destination = graph.nodes.get('https://redirect.com/dest');
  expect(destination).toBeDefined();
  expect(destination?.status).toBe(200);

  // Scenario 2: / links to /a; /a and /b redirect to each other.
  const loopSite = mockAgent.get('https://loop.com');
  loopSite.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
  loopSite.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/a">Loop</a></html>
  `, asHtml());
  loopSite.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
  loopSite.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
  // Each interceptor is registered once; /a is not mocked a second time.

  const loopSnapshotId = await crawl('https://loop.com', crawlOptions());
  const loopGraph = loadGraphFromSnapshot(loopSnapshotId);

  // Reaching this point means the crawl ended; the graph must not be empty.
  expect(loopGraph.getNodes().length).toBeGreaterThan(0);
});
231
+
232
/**
 * A response served as application/json must be recorded in the graph but
 * its body must not be scanned for links, even though it contains an anchor.
 */
test('mime check', async () => {
  const site = mockAgent.get('https://mime.com');
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // Registered but not linked from the crawl entry point (/start) in this fixture.
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/image.png">Img</a></html>
  `, asHtml());

  // /data carries HTML-looking content but is served as JSON.
  site.intercept({ path: '/data', method: 'GET' }).reply(200, `
    <html><a href="/hidden">Hidden</a></html>
  `, { headers: { 'content-type': 'application/json' } });

  // The entry point links to /data.
  site.intercept({ path: '/start', method: 'GET' }).reply(200, `
    <html><a href="/data">Data</a></html>
  `, asHtml());

  const snapshotId = await crawl('https://mime.com/start', {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  // /data itself is a node ...
  expect(graph.nodes.get('https://mime.com/data')).toBeDefined();
  // ... but the link inside its non-HTML body was never followed.
  expect(graph.nodes.get('https://mime.com/hidden')).toBeUndefined();
});
259
+
260
/**
 * A page linking to itself must not yield a self-referencing edge, while its
 * link to a different page must still be stored.
 */
test('self-link guard', async () => {
  const rootUrl = 'https://self.com/';
  const otherUrl = 'https://self.com/other';
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  const site = mockAgent.get('https://self.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/">Self</a><a href="/other">Other</a></html>
  `, asHtml());
  site.intercept({ path: '/other', method: 'GET' }).reply(200, '', asHtml());

  const snapshotId = await crawl('https://self.com', {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    rate: 1000,
  });
  const edges = loadGraphFromSnapshot(snapshotId).getEdges();

  const findEdge = (source: string, target: string) =>
    edges.find((edge) => edge.source === source && edge.target === target);

  expect(findEdge(rootUrl, rootUrl)).toBeUndefined();
  expect(findEdge(rootUrl, otherUrl)).toBeDefined();
});
280
+
281
/**
 * Crawling with `limit: 2` a root that links to two further pages must leave
 * `limitReached` set on the graph loaded from the snapshot.
 */
test('limit warning', async () => {
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  const site = mockAgent.get('https://warn.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
  site.intercept({ path: '/', method: 'GET' }).reply(200, `
    <html><a href="/1">1</a><a href="/2">2</a></html>
  `, asHtml());
  // Only /1 has an interceptor; /2 is linked but never mocked.
  site.intercept({ path: '/1', method: 'GET' }).reply(200, '', asHtml());

  const snapshotId = await crawl('https://warn.com', {
    limit: 2,
    depth: 5,
    ignoreRobots: true,
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  expect(graph.limitReached).toBe(true);
});
296
+
297
/**
 * With the `sitemap` option set, a URL that appears only in sitemap.xml (the
 * root page has no links) must end up in the graph with status 200.
 */
test('seeds from sitemap', async () => {
  const asHtml = () => ({ headers: { 'content-type': 'text/html' } });

  const site = mockAgent.get('https://sitemap-seed.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // The sitemap lists a single URL: /page1.
  site.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
    <urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
  `);

  // Neither page contains any anchors.
  site.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', asHtml());
  site.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', asHtml());

  const snapshotId = await crawl('https://sitemap-seed.com', {
    limit: 10,
    depth: 5,
    ignoreRobots: true,
    sitemap: 'true',
    rate: 1000,
  });
  const graph = loadGraphFromSnapshot(snapshotId);

  const seededPage = graph.nodes.get('https://sitemap-seed.com/page1');
  expect(seededPage).toBeDefined();
  expect(seededPage?.status).toBe(200);
});
325
+
326
/**
 * Runs two crawls of the same site. The first stores the page's ETag; the
 * second is handed the first graph as `previousGraph` and is answered with a
 * 304 for a request carrying `If-None-Match`, after which the node must be
 * flagged as 'unchanged'.
 */
test('incremental crawl uses etags', async () => {
  const rootUrl = 'https://incremental.com/';
  const site = mockAgent.get('https://incremental.com');
  site.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');

  // --- First crawl: plain 200 with an ETag header ---
  site.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
    headers: { 'content-type': 'text/html', 'etag': '"v1"' },
  });

  const firstSnapshotId = await crawl('https://incremental.com', {
    limit: 10,
    depth: 1,
    ignoreRobots: true,
    rate: 1000,
  });
  const firstGraph = loadGraphFromSnapshot(firstSnapshotId);
  expect(firstGraph.nodes.get(rootUrl)?.etag).toBe('"v1"');

  // --- Second crawl: interceptor only matches when If-None-Match is "v1" ---
  site
    .intercept({
      path: '/',
      method: 'GET',
      headers: { 'If-None-Match': '"v1"' },
    })
    .reply(304, '', { headers: { 'etag': '"v1"' } });

  const secondSnapshotId = await crawl('https://incremental.com', {
    limit: 10,
    depth: 1,
    ignoreRobots: true,
    previousGraph: firstGraph,
    rate: 1000,
  });
  const secondGraph = loadGraphFromSnapshot(secondSnapshotId);

  expect(secondGraph.nodes.get(rootUrl)?.incrementalStatus).toBe('unchanged');
});
@@ -0,0 +1,159 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import Database from 'better-sqlite3';
3
+ import { initSchema } from '../src/db/schema.js';
4
+ import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
+ import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
+ import { PageRepository } from '../src/db/repositories/PageRepository.js';
7
+ import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
8
+ import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
9
+
10
/**
 * Repository-level tests for the persistence layer. Each test gets its own
 * in-memory better-sqlite3 database with the schema freshly initialised.
 */
describe('Database Layer', () => {
  let db: Database.Database;
  let siteRepo: SiteRepository;
  let snapshotRepo: SnapshotRepository;
  let pageRepo: PageRepository;
  let edgeRepo: EdgeRepository;
  let metricsRepo: MetricsRepository;

  /** Creates a site for `domain` plus one 'full' snapshot and returns both ids. */
  const seedSiteWithSnapshot = (domain: string) => {
    const siteId = siteRepo.createSite(domain);
    const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
    return { siteId, snapshotId };
  };

  beforeEach(() => {
    db = new Database(':memory:');
    initSchema(db);
    siteRepo = new SiteRepository(db);
    snapshotRepo = new SnapshotRepository(db);
    pageRepo = new PageRepository(db);
    edgeRepo = new EdgeRepository(db);
    metricsRepo = new MetricsRepository(db);
  });

  afterEach(() => {
    db.close();
  });

  it('should create and retrieve a site', () => {
    const domain = 'example.com';

    const createdId = siteRepo.createSite(domain);
    expect(createdId).toBeGreaterThan(0);

    const stored = siteRepo.getSite(domain);
    expect(stored).toBeDefined();
    expect(stored?.domain).toBe(domain);
  });

  it('should create and retrieve a snapshot', () => {
    const siteId = siteRepo.createSite('example.com');
    const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
    expect(snapshotId).toBeGreaterThan(0);

    const beforeUpdate = snapshotRepo.getLatestSnapshot(siteId);
    expect(beforeUpdate).toBeDefined();
    expect(beforeUpdate?.status).toBe('running');

    // Mark the snapshot completed and attach counts in the same call.
    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 10, edge_count: 5 });

    const afterUpdate = snapshotRepo.getLatestSnapshot(siteId);
    expect(afterUpdate?.status).toBe('completed');
    expect(afterUpdate?.node_count).toBe(10);
  });

  it('should upsert pages', () => {
    const { siteId, snapshotId } = seedSiteWithSnapshot('example.com');
    const url = 'http://example.com';

    // Initial insert under the first snapshot.
    pageRepo.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: snapshotId,
      http_status: 200,
      depth: 0,
    });

    const inserted = pageRepo.getPage(siteId, url);
    expect(inserted).toBeDefined();
    expect(inserted?.first_seen_snapshot_id).toBe(snapshotId);
    expect(inserted?.last_seen_snapshot_id).toBe(snapshotId);
    expect(inserted?.http_status).toBe(200);

    // Same URL again under a second (incremental) snapshot, status unchanged.
    const laterSnapshotId = snapshotRepo.createSnapshot(siteId, 'incremental');
    pageRepo.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: laterSnapshotId,
      http_status: 200,
      depth: 0,
    });

    const updated = pageRepo.getPage(siteId, url);
    expect(updated?.first_seen_snapshot_id).toBe(snapshotId); // first-seen is preserved
    expect(updated?.last_seen_snapshot_id).toBe(laterSnapshotId); // last-seen moves forward
  });

  it('should persist new columns (nofollow, security_error, retries)', () => {
    const { siteId, snapshotId } = seedSiteWithSnapshot('new-cols.com');
    const url = 'http://new-cols.com';

    pageRepo.upsertPage({
      site_id: siteId,
      normalized_url: url,
      last_seen_snapshot_id: snapshotId,
      nofollow: 1,
      security_error: 'blocked',
      retries: 3,
    });

    const stored = pageRepo.getPage(siteId, url);
    expect(stored?.nofollow).toBe(1);
    expect(stored?.security_error).toBe('blocked');
    expect(stored?.retries).toBe(3);
  });

  it('should insert and retrieve edges', () => {
    const { siteId, snapshotId } = seedSiteWithSnapshot('example.com');
    const firstUrl = 'http://example.com/1';
    const secondUrl = 'http://example.com/2';

    // Edges reference page ids, so both pages are stored first.
    pageRepo.upsertPage({ site_id: siteId, normalized_url: firstUrl, last_seen_snapshot_id: snapshotId });
    pageRepo.upsertPage({ site_id: siteId, normalized_url: secondUrl, last_seen_snapshot_id: snapshotId });

    const sourcePage = pageRepo.getPage(siteId, firstUrl)!;
    const targetPage = pageRepo.getPage(siteId, secondUrl)!;

    edgeRepo.insertEdge(snapshotId, sourcePage.id, targetPage.id, 1.0, 'internal');

    const storedEdges = edgeRepo.getEdgesBySnapshot(snapshotId);
    expect(storedEdges).toHaveLength(1);
    expect(storedEdges[0].source_page_id).toBe(sourcePage.id);
    expect(storedEdges[0].target_page_id).toBe(targetPage.id);
  });

  it('should insert and retrieve metrics', () => {
    const { siteId, snapshotId } = seedSiteWithSnapshot('example.com');
    const pageUrl = 'http://example.com/1';

    pageRepo.upsertPage({ site_id: siteId, normalized_url: pageUrl, last_seen_snapshot_id: snapshotId });
    const page = pageRepo.getPage(siteId, pageUrl)!;

    metricsRepo.insertMetrics({
      snapshot_id: snapshotId,
      page_id: page.id,
      authority_score: 0.5,
      hub_score: 0.2,
      pagerank: 0.8,
      pagerank_score: 80.0,
      link_role: 'authority',
      crawl_status: 'fetched',
      word_count: 100,
      thin_content_score: 0.1,
      external_link_ratio: 0.0,
      orphan_score: 0,
      duplicate_cluster_id: null,
      duplicate_type: null,
      is_cluster_primary: 0,
    });

    const storedMetrics = metricsRepo.getMetricsForPage(snapshotId, page.id);
    expect(storedMetrics).toBeDefined();
    expect(storedMetrics?.authority_score).toBe(0.5);
  });
});
@@ -0,0 +1,67 @@
1
+ import { test, expect } from 'vitest';
2
+ import { Graph } from '../src/graph/graph.js';
3
+ import { compareGraphs } from '../src/diff/compare.js';
4
+
5
/**
 * A URL present only in the new graph is reported as added; a URL present
 * only in the old graph is reported as removed.
 */
test('detects added and removed urls', () => {
  const sharedUrl = 'https://example.com/a';
  const removedUrl = 'https://example.com/b';
  const addedUrl = 'https://example.com/c';

  const before = new Graph();
  before.addNode(sharedUrl, 0, 200);
  before.addNode(removedUrl, 1, 200);

  const after = new Graph();
  after.addNode(sharedUrl, 0, 200);
  after.addNode(addedUrl, 1, 200);

  const { addedUrls, removedUrls } = compareGraphs(before, after);

  expect(addedUrls).toContain(addedUrl);
  expect(removedUrls).toContain(removedUrl);
});
18
+
19
/**
 * The same URL with a different status in each graph must produce exactly
 * one `changedStatus` entry carrying both the old and the new status.
 */
test('detects status changes', () => {
  const url = 'https://example.com/a';

  const before = new Graph();
  before.addNode(url, 0, 200);

  const after = new Graph();
  after.addNode(url, 0, 404);

  const { changedStatus } = compareGraphs(before, after);

  expect(changedStatus).toHaveLength(1);
  const [change] = changedStatus;
  expect(change).toEqual({
    url,
    oldStatus: 200,
    newStatus: 404,
  });
});
34
+
35
/**
 * The same URL with a different canonical in each graph must produce exactly
 * one `changedCanonical` entry carrying both canonical values.
 */
test('detects canonical changes', () => {
  const url = 'https://example.com/a';
  const previousCanonical = 'https://example.com/canon1';
  const currentCanonical = 'https://example.com/canon2';

  const before = new Graph();
  before.addNode(url, 0, 200);
  before.updateNodeData(url, { canonical: previousCanonical });

  const after = new Graph();
  after.addNode(url, 0, 200);
  after.updateNodeData(url, { canonical: currentCanonical });

  const { changedCanonical } = compareGraphs(before, after);

  expect(changedCanonical).toHaveLength(1);
  const [change] = changedCanonical;
  expect(change).toEqual({
    url,
    oldCanonical: previousCanonical,
    newCanonical: currentCanonical,
  });
});
52
+
53
/**
 * The old graph holds a single depth-1 page with no inbound links; the new
 * graph adds a root that links to it. The test expects the orphan-count
 * delta reported by compareGraphs to be -1.
 */
test('calculates metric deltas', () => {
  const rootUrl = 'https://example.com/';
  const pageUrl = 'https://example.com/a';

  // Before: the page exists at depth 1 with nothing linking to it.
  const before = new Graph();
  before.addNode(pageUrl, 1, 200);

  // After: the root exists and links to the page.
  const after = new Graph();
  after.addNode(rootUrl, 0, 200);
  after.addNode(pageUrl, 1, 200);
  after.addEdge(rootUrl, pageUrl);

  const { metricDeltas } = compareGraphs(before, after);

  // Expected orphan counts: 1 before, 0 after.
  expect(metricDeltas.orphanCount).toBe(-1);
});