@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,358 +0,0 @@
1
- import { test, expect, beforeEach, afterEach } from 'vitest';
2
- import { crawl } from '../src/crawler/crawl.js';
3
- import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
4
- import { closeDb } from '../src/db/index.js';
5
- import { MockAgent, setGlobalDispatcher } from 'undici';
6
-
7
- let mockAgent: MockAgent;
8
-
9
- beforeEach(() => {
10
- process.env.CRAWLITH_DB_PATH = ':memory:';
11
- mockAgent = new MockAgent();
12
- mockAgent.disableNetConnect();
13
- setGlobalDispatcher(mockAgent);
14
- });
15
-
16
- afterEach(() => {
17
- closeDb();
18
- });
19
-
20
- test('crawler should crawl and build graph', async () => {
21
- const client = mockAgent.get('https://example.com');
22
-
23
- // Root
24
- client.intercept({
25
- path: '/',
26
- method: 'GET'
27
- }).reply(200, `
28
- <html><body>
29
- <a href="/page1">Page 1</a>
30
- <a href="/page2">Page 2</a>
31
- </body></html>
32
- `, {
33
- headers: { 'content-type': 'text/html' }
34
- });
35
-
36
- // Page 1
37
- client.intercept({
38
- path: '/page1',
39
- method: 'GET'
40
- }).reply(200, `
41
- <html><body>
42
- <a href="/page2">Page 2</a>
43
- </body></html>
44
- `, {
45
- headers: { 'content-type': 'text/html' }
46
- });
47
-
48
- // Page 2
49
- client.intercept({
50
- path: '/page2',
51
- method: 'GET'
52
- }).reply(200, `
53
- <html><body>
54
- <a href="/">Home</a>
55
- </body></html>
56
- `, {
57
- headers: { 'content-type': 'text/html' }
58
- });
59
-
60
- // Robots.txt
61
- client.intercept({
62
- path: '/robots.txt',
63
- method: 'GET'
64
- }).reply(404, 'Not Found');
65
-
66
- const snapshotId = await crawl('https://example.com', {
67
- limit: 10,
68
- depth: 2,
69
- ignoreRobots: false,
70
- rate: 1000
71
- });
72
- const graph = loadGraphFromSnapshot(snapshotId);
73
-
74
- const nodes = graph.getNodes();
75
- expect(nodes.length).toBe(3);
76
-
77
- const root = graph.nodes.get('https://example.com/');
78
- expect(root).toBeDefined();
79
- expect(root?.depth).toBe(0);
80
- expect(root?.outLinks).toBe(2);
81
-
82
- const page1 = graph.nodes.get('https://example.com/page1');
83
- expect(page1).toBeDefined();
84
- expect(page1?.depth).toBe(1);
85
- expect(page1?.inLinks).toBe(1);
86
-
87
- const page2 = graph.nodes.get('https://example.com/page2');
88
- expect(page2).toBeDefined();
89
- expect(page2?.inLinks).toBe(2);
90
- });
91
-
92
- test('hard page limit', async () => {
93
- const client = mockAgent.get('https://limit.com');
94
-
95
- // Robots
96
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
97
-
98
- // Root links to 1, 2, 3
99
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
100
- <html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
101
- `, { headers: { 'content-type': 'text/html' } });
102
-
103
- // 1, 2, 3 return html
104
- client.intercept({ path: '/1', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
105
- client.intercept({ path: '/2', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
106
- client.intercept({ path: '/3', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
107
-
108
- const snapshotId = await crawl('https://limit.com', {
109
- limit: 2, // root + 1 page
110
- depth: 5,
111
- ignoreRobots: true,
112
- rate: 1000
113
- });
114
- const graph = loadGraphFromSnapshot(snapshotId);
115
-
116
- // Should have visited root + 1 other page (total 2 nodes with status > 0)
117
- const crawledNodes = graph.getNodes().filter(n => n.status > 0);
118
- expect(crawledNodes.length).toBeLessThanOrEqual(2);
119
- });
120
-
121
- test('hard depth cap', async () => {
122
- const client = mockAgent.get('https://depth.com');
123
-
124
- // Robots
125
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
126
-
127
- // Chain of 12 pages
128
- for (let i = 0; i < 12; i++) {
129
- const path = i === 0 ? '/' : `/p${i}`;
130
- const nextPath = `/p${i + 1}`;
131
- client.intercept({ path, method: 'GET' }).reply(200, `
132
- <html><a href="${nextPath}">Next</a></html>
133
- `, { headers: { 'content-type': 'text/html' } });
134
- }
135
-
136
- const snapshotId = await crawl('https://depth.com', {
137
- limit: 100,
138
- depth: 20, // requested 20, but internal hard cap is 10
139
- ignoreRobots: true,
140
- rate: 1000
141
- });
142
- const graph = loadGraphFromSnapshot(snapshotId);
143
-
144
- const crawledNodes = graph.getNodes().filter(n => n.status > 0);
145
- const maxCrawledDepth = crawledNodes.reduce((max, n) => Math.max(max, n.depth), 0);
146
-
147
- expect(maxCrawledDepth).toBeLessThanOrEqual(10);
148
- });
149
-
150
- test('parameter explosion control', async () => {
151
- const client = mockAgent.get('https://params.com');
152
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
153
-
154
- // Root links to many variations
155
- let links = '';
156
- for (let i = 0; i < 10; i++) {
157
- links += `<a href="/search?q=${i}">q${i}</a>`;
158
- }
159
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
160
- <html>${links}</html>
161
- `, { headers: { 'content-type': 'text/html' } });
162
-
163
- // Intercept all variations
164
- for (let i = 0; i < 40; i++) {
165
- client.intercept({ path: `/search?q=${i}`, method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
166
- }
167
-
168
- const snapshotId = await crawl('https://params.com', {
169
- limit: 100,
170
- depth: 5,
171
- ignoreRobots: true,
172
- stripQuery: false,
173
- detectTraps: true,
174
- rate: 1000
175
- });
176
- const graph = loadGraphFromSnapshot(snapshotId);
177
-
178
- // Should only crawl 5 variations + root
179
- const nodes = graph.getNodes();
180
- // Filter nodes that match /search pathname
181
- const searchNodes = nodes.filter(n => n.url.includes('/search') && n.status > 0);
182
-
183
- expect(searchNodes.length).toBeLessThanOrEqual(31);
184
- });
185
-
186
- test('redirect safety', async () => {
187
- const client = mockAgent.get('https://redirect.com');
188
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
189
-
190
- // Root -> /redir1
191
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
192
- <html><a href="/redir1">Go</a></html>
193
- `, { headers: { 'content-type': 'text/html' } });
194
-
195
- // /redir1 -> 301 -> /dest
196
- client.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
197
- headers: { 'location': '/dest' }
198
- });
199
-
200
- // /dest -> 200
201
- client.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', { headers: { 'content-type': 'text/html' } });
202
-
203
- const snapshotId = await crawl('https://redirect.com', {
204
- limit: 10,
205
- depth: 5,
206
- ignoreRobots: true,
207
- rate: 1000
208
- });
209
- const graph = loadGraphFromSnapshot(snapshotId);
210
-
211
- const destNode = graph.nodes.get('https://redirect.com/dest');
212
- expect(destNode).toBeDefined();
213
- expect(destNode?.status).toBe(200);
214
-
215
- // Redirect loop: A -> B -> A
216
- const clientLoop = mockAgent.get('https://loop.com');
217
- clientLoop.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
218
- clientLoop.intercept({ path: '/', method: 'GET' }).reply(200, `
219
- <html><a href="/a">Loop</a></html>
220
- `, { headers: { 'content-type': 'text/html' } });
221
-
222
- clientLoop.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
223
- clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
224
- // We might mock /a again if it retries, but it shouldn't infinitely loop
225
-
226
- const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
227
- const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
228
- // It should eventually stop
229
- expect(graphLoop.getNodes().length).toBeGreaterThan(0);
230
- });
231
-
232
- test('mime check', async () => {
233
- const client = mockAgent.get('https://mime.com');
234
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
235
-
236
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
237
- <html><a href="/image.png">Img</a></html>
238
- `, { headers: { 'content-type': 'text/html' } });
239
-
240
- client.intercept({ path: '/data', method: 'GET' }).reply(200, `
241
- <html><a href="/hidden">Hidden</a></html>
242
- `, { headers: { 'content-type': 'application/json' } });
243
-
244
- // Root links to /data
245
- client.intercept({ path: '/start', method: 'GET' }).reply(200, `
246
- <html><a href="/data">Data</a></html>
247
- `, { headers: { 'content-type': 'text/html' } });
248
-
249
- const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
250
- const graph = loadGraphFromSnapshot(snapshotId);
251
-
252
- // /data should be in graph
253
- const dataNode = graph.nodes.get('https://mime.com/data');
254
- expect(dataNode).toBeDefined();
255
- // But we should NOT have parsed it, so /hidden should NOT be in graph
256
- const hiddenNode = graph.nodes.get('https://mime.com/hidden');
257
- expect(hiddenNode).toBeUndefined();
258
- });
259
-
260
- test('self-link guard', async () => {
261
- const client = mockAgent.get('https://self.com');
262
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
263
-
264
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
265
- <html><a href="/">Self</a><a href="/other">Other</a></html>
266
- `, { headers: { 'content-type': 'text/html' } });
267
-
268
- client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
269
-
270
- const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
271
- const graph = loadGraphFromSnapshot(snapshotId);
272
-
273
- const edges = graph.getEdges();
274
- const selfEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/');
275
- expect(selfEdge).toBeUndefined();
276
-
277
- const otherEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/other');
278
- expect(otherEdge).toBeDefined();
279
- });
280
-
281
- test('limit warning', async () => {
282
- const client = mockAgent.get('https://warn.com');
283
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
284
-
285
- client.intercept({ path: '/', method: 'GET' }).reply(200, `
286
- <html><a href="/1">1</a><a href="/2">2</a></html>
287
- `, { headers: { 'content-type': 'text/html' } });
288
-
289
- client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
290
-
291
- const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 });
292
- const graph = loadGraphFromSnapshot(snapshotId);
293
-
294
- expect(graph.limitReached).toBe(true);
295
- });
296
-
297
- test('seeds from sitemap', async () => {
298
- const client = mockAgent.get('https://sitemap-seed.com');
299
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
300
-
301
- // Sitemap
302
- client.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
303
- <urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
304
- `);
305
-
306
- // Root
307
- client.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', { headers: { 'content-type': 'text/html' } });
308
-
309
- // Page 1
310
- client.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', { headers: { 'content-type': 'text/html' } });
311
-
312
- const snapshotId = await crawl('https://sitemap-seed.com', {
313
- limit: 10,
314
- depth: 5,
315
- ignoreRobots: true,
316
- sitemap: 'true',
317
- rate: 1000
318
- });
319
- const graph = loadGraphFromSnapshot(snapshotId);
320
-
321
- const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
322
- expect(page1).toBeDefined();
323
- expect(page1?.status).toBe(200);
324
- });
325
-
326
- test('incremental crawl uses etags', async () => {
327
- const client = mockAgent.get('https://incremental.com');
328
- client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
329
-
330
- // First crawl setup
331
- client.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
332
- headers: { 'content-type': 'text/html', 'etag': '"v1"' }
333
- });
334
-
335
- const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 });
336
- const graph1 = loadGraphFromSnapshot(snapshotId1);
337
- const node1 = graph1.nodes.get('https://incremental.com/');
338
- expect(node1?.etag).toBe('"v1"');
339
-
340
- // Second crawl setup
341
- client.intercept({
342
- path: '/',
343
- method: 'GET',
344
- headers: { 'If-None-Match': '"v1"' }
345
- }).reply(304, '', { headers: { 'etag': '"v1"' } });
346
-
347
- const snapshotId2 = await crawl('https://incremental.com', {
348
- limit: 10,
349
- depth: 1,
350
- ignoreRobots: true,
351
- previousGraph: graph1,
352
- rate: 1000
353
- });
354
- const graph2 = loadGraphFromSnapshot(snapshotId2);
355
-
356
- const node2 = graph2.nodes.get('https://incremental.com/');
357
- expect(node2?.incrementalStatus).toBe('unchanged');
358
- });
package/tests/db.test.ts DELETED
@@ -1,159 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
- import Database from 'better-sqlite3';
3
- import { initSchema } from '../src/db/schema.js';
4
- import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
- import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
- import { PageRepository } from '../src/db/repositories/PageRepository.js';
7
- import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
8
- import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
9
-
10
- describe('Database Layer', () => {
11
- let db: Database.Database;
12
- let siteRepo: SiteRepository;
13
- let snapshotRepo: SnapshotRepository;
14
- let pageRepo: PageRepository;
15
- let edgeRepo: EdgeRepository;
16
- let metricsRepo: MetricsRepository;
17
-
18
- beforeEach(() => {
19
- db = new Database(':memory:');
20
- initSchema(db);
21
- siteRepo = new SiteRepository(db);
22
- snapshotRepo = new SnapshotRepository(db);
23
- pageRepo = new PageRepository(db);
24
- edgeRepo = new EdgeRepository(db);
25
- metricsRepo = new MetricsRepository(db);
26
- });
27
-
28
- afterEach(() => {
29
- db.close();
30
- });
31
-
32
- it('should create and retrieve a site', () => {
33
- const domain = 'example.com';
34
- const id = siteRepo.createSite(domain);
35
- expect(id).toBeGreaterThan(0);
36
-
37
- const site = siteRepo.getSite(domain);
38
- expect(site).toBeDefined();
39
- expect(site?.domain).toBe(domain);
40
- });
41
-
42
- it('should create and retrieve a snapshot', () => {
43
- const siteId = siteRepo.createSite('example.com');
44
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
45
- expect(snapshotId).toBeGreaterThan(0);
46
-
47
- const snapshot = snapshotRepo.getLatestSnapshot(siteId);
48
- expect(snapshot).toBeDefined();
49
- expect(snapshot?.status).toBe('running');
50
-
51
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 10, edge_count: 5 });
52
- const updated = snapshotRepo.getLatestSnapshot(siteId);
53
- expect(updated?.status).toBe('completed');
54
- expect(updated?.node_count).toBe(10);
55
- });
56
-
57
- it('should upsert pages', () => {
58
- const siteId = siteRepo.createSite('example.com');
59
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
60
- const url = 'http://example.com';
61
-
62
- // First insert
63
- pageRepo.upsertPage({
64
- site_id: siteId,
65
- normalized_url: url,
66
- last_seen_snapshot_id: snapshotId,
67
- http_status: 200,
68
- depth: 0
69
- });
70
-
71
- let page = pageRepo.getPage(siteId, url);
72
- expect(page).toBeDefined();
73
- expect(page?.first_seen_snapshot_id).toBe(snapshotId);
74
- expect(page?.last_seen_snapshot_id).toBe(snapshotId);
75
- expect(page?.http_status).toBe(200);
76
-
77
- // Update (second snapshot)
78
- const snapshotId2 = snapshotRepo.createSnapshot(siteId, 'incremental');
79
- pageRepo.upsertPage({
80
- site_id: siteId,
81
- normalized_url: url,
82
- last_seen_snapshot_id: snapshotId2,
83
- http_status: 200, // same status
84
- depth: 0
85
- });
86
-
87
- page = pageRepo.getPage(siteId, url);
88
- expect(page?.first_seen_snapshot_id).toBe(snapshotId); // Should remain the first one
89
- expect(page?.last_seen_snapshot_id).toBe(snapshotId2); // Should update to the second one
90
- });
91
-
92
- it('should persist new columns (nofollow, security_error, retries)', () => {
93
- const siteId = siteRepo.createSite('new-cols.com');
94
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
95
- const url = 'http://new-cols.com';
96
-
97
- pageRepo.upsertPage({
98
- site_id: siteId,
99
- normalized_url: url,
100
- last_seen_snapshot_id: snapshotId,
101
- nofollow: 1,
102
- security_error: 'blocked',
103
- retries: 3
104
- });
105
-
106
- const page = pageRepo.getPage(siteId, url);
107
- expect(page?.nofollow).toBe(1);
108
- expect(page?.security_error).toBe('blocked');
109
- expect(page?.retries).toBe(3);
110
- });
111
-
112
- it('should insert and retrieve edges', () => {
113
- const siteId = siteRepo.createSite('example.com');
114
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
115
-
116
- // Create pages first
117
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/1', last_seen_snapshot_id: snapshotId });
118
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/2', last_seen_snapshot_id: snapshotId });
119
-
120
- const p1 = pageRepo.getPage(siteId, 'http://example.com/1')!;
121
- const p2 = pageRepo.getPage(siteId, 'http://example.com/2')!;
122
-
123
- edgeRepo.insertEdge(snapshotId, p1.id, p2.id, 1.0, 'internal');
124
-
125
- const edges = edgeRepo.getEdgesBySnapshot(snapshotId);
126
- expect(edges).toHaveLength(1);
127
- expect(edges[0].source_page_id).toBe(p1.id);
128
- expect(edges[0].target_page_id).toBe(p2.id);
129
- });
130
-
131
- it('should insert and retrieve metrics', () => {
132
- const siteId = siteRepo.createSite('example.com');
133
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
134
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/1', last_seen_snapshot_id: snapshotId });
135
- const p1 = pageRepo.getPage(siteId, 'http://example.com/1')!;
136
-
137
- metricsRepo.insertMetrics({
138
- snapshot_id: snapshotId,
139
- page_id: p1.id,
140
- authority_score: 0.5,
141
- hub_score: 0.2,
142
- pagerank: 0.8,
143
- pagerank_score: 80.0,
144
- link_role: 'authority',
145
- crawl_status: 'fetched',
146
- word_count: 100,
147
- thin_content_score: 0.1,
148
- external_link_ratio: 0.0,
149
- orphan_score: 0,
150
- duplicate_cluster_id: null,
151
- duplicate_type: null,
152
- is_cluster_primary: 0
153
- });
154
-
155
- const metrics = metricsRepo.getMetricsForPage(snapshotId, p1.id);
156
- expect(metrics).toBeDefined();
157
- expect(metrics?.authority_score).toBe(0.5);
158
- });
159
- });
@@ -1,67 +0,0 @@
1
- import { test, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { compareGraphs } from '../src/diff/compare.js';
4
-
5
- test('detects added and removed urls', () => {
6
- const oldGraph = new Graph();
7
- oldGraph.addNode('https://example.com/a', 0, 200);
8
- oldGraph.addNode('https://example.com/b', 1, 200);
9
-
10
- const newGraph = new Graph();
11
- newGraph.addNode('https://example.com/a', 0, 200);
12
- newGraph.addNode('https://example.com/c', 1, 200); // Added
13
-
14
- const diff = compareGraphs(oldGraph, newGraph);
15
- expect(diff.addedUrls).toContain('https://example.com/c');
16
- expect(diff.removedUrls).toContain('https://example.com/b');
17
- });
18
-
19
- test('detects status changes', () => {
20
- const oldGraph = new Graph();
21
- oldGraph.addNode('https://example.com/a', 0, 200);
22
-
23
- const newGraph = new Graph();
24
- newGraph.addNode('https://example.com/a', 0, 404);
25
-
26
- const diff = compareGraphs(oldGraph, newGraph);
27
- expect(diff.changedStatus).toHaveLength(1);
28
- expect(diff.changedStatus[0]).toEqual({
29
- url: 'https://example.com/a',
30
- oldStatus: 200,
31
- newStatus: 404
32
- });
33
- });
34
-
35
- test('detects canonical changes', () => {
36
- const oldGraph = new Graph();
37
- oldGraph.addNode('https://example.com/a', 0, 200);
38
- oldGraph.updateNodeData('https://example.com/a', { canonical: 'https://example.com/canon1' });
39
-
40
- const newGraph = new Graph();
41
- newGraph.addNode('https://example.com/a', 0, 200);
42
- newGraph.updateNodeData('https://example.com/a', { canonical: 'https://example.com/canon2' });
43
-
44
- const diff = compareGraphs(oldGraph, newGraph);
45
- expect(diff.changedCanonical).toHaveLength(1);
46
- expect(diff.changedCanonical[0]).toEqual({
47
- url: 'https://example.com/a',
48
- oldCanonical: 'https://example.com/canon1',
49
- newCanonical: 'https://example.com/canon2'
50
- });
51
- });
52
-
53
- test('calculates metric deltas', () => {
54
- const oldGraph = new Graph();
55
- // Orphan: A (depth 1, inLinks 0)
56
- oldGraph.addNode('https://example.com/a', 1, 200);
57
-
58
- const newGraph = new Graph();
59
- // Not Orphan: Root -> A
60
- newGraph.addNode('https://example.com/', 0, 200);
61
- newGraph.addNode('https://example.com/a', 1, 200);
62
- newGraph.addEdge('https://example.com/', 'https://example.com/a');
63
-
64
- const diff = compareGraphs(oldGraph, newGraph);
65
- // Old orphan count: 1 (A). New: 0. Delta: -1.
66
- expect(diff.metricDeltas.orphanCount).toBe(-1);
67
- });