@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
package/tests/db.test.ts DELETED
@@ -1,159 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
- import Database from 'better-sqlite3';
3
- import { initSchema } from '../src/db/schema.js';
4
- import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
- import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
- import { PageRepository } from '../src/db/repositories/PageRepository.js';
7
- import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
8
- import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
9
-
10
- describe('Database Layer', () => {
11
- let db: Database.Database;
12
- let siteRepo: SiteRepository;
13
- let snapshotRepo: SnapshotRepository;
14
- let pageRepo: PageRepository;
15
- let edgeRepo: EdgeRepository;
16
- let metricsRepo: MetricsRepository;
17
-
18
- beforeEach(() => {
19
- db = new Database(':memory:');
20
- initSchema(db);
21
- siteRepo = new SiteRepository(db);
22
- snapshotRepo = new SnapshotRepository(db);
23
- pageRepo = new PageRepository(db);
24
- edgeRepo = new EdgeRepository(db);
25
- metricsRepo = new MetricsRepository(db);
26
- });
27
-
28
- afterEach(() => {
29
- db.close();
30
- });
31
-
32
- it('should create and retrieve a site', () => {
33
- const domain = 'example.com';
34
- const id = siteRepo.createSite(domain);
35
- expect(id).toBeGreaterThan(0);
36
-
37
- const site = siteRepo.getSite(domain);
38
- expect(site).toBeDefined();
39
- expect(site?.domain).toBe(domain);
40
- });
41
-
42
- it('should create and retrieve a snapshot', () => {
43
- const siteId = siteRepo.createSite('example.com');
44
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
45
- expect(snapshotId).toBeGreaterThan(0);
46
-
47
- const snapshot = snapshotRepo.getLatestSnapshot(siteId);
48
- expect(snapshot).toBeDefined();
49
- expect(snapshot?.status).toBe('running');
50
-
51
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 10, edge_count: 5 });
52
- const updated = snapshotRepo.getLatestSnapshot(siteId);
53
- expect(updated?.status).toBe('completed');
54
- expect(updated?.node_count).toBe(10);
55
- });
56
-
57
- it('should upsert pages', () => {
58
- const siteId = siteRepo.createSite('example.com');
59
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
60
- const url = 'http://example.com';
61
-
62
- // First insert
63
- pageRepo.upsertPage({
64
- site_id: siteId,
65
- normalized_url: url,
66
- last_seen_snapshot_id: snapshotId,
67
- http_status: 200,
68
- depth: 0
69
- });
70
-
71
- let page = pageRepo.getPage(siteId, url);
72
- expect(page).toBeDefined();
73
- expect(page?.first_seen_snapshot_id).toBe(snapshotId);
74
- expect(page?.last_seen_snapshot_id).toBe(snapshotId);
75
- expect(page?.http_status).toBe(200);
76
-
77
- // Update (second snapshot)
78
- const snapshotId2 = snapshotRepo.createSnapshot(siteId, 'incremental');
79
- pageRepo.upsertPage({
80
- site_id: siteId,
81
- normalized_url: url,
82
- last_seen_snapshot_id: snapshotId2,
83
- http_status: 200, // same status
84
- depth: 0
85
- });
86
-
87
- page = pageRepo.getPage(siteId, url);
88
- expect(page?.first_seen_snapshot_id).toBe(snapshotId); // Should remain the first one
89
- expect(page?.last_seen_snapshot_id).toBe(snapshotId2); // Should update to the second one
90
- });
91
-
92
- it('should persist new columns (nofollow, security_error, retries)', () => {
93
- const siteId = siteRepo.createSite('new-cols.com');
94
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
95
- const url = 'http://new-cols.com';
96
-
97
- pageRepo.upsertPage({
98
- site_id: siteId,
99
- normalized_url: url,
100
- last_seen_snapshot_id: snapshotId,
101
- nofollow: 1,
102
- security_error: 'blocked',
103
- retries: 3
104
- });
105
-
106
- const page = pageRepo.getPage(siteId, url);
107
- expect(page?.nofollow).toBe(1);
108
- expect(page?.security_error).toBe('blocked');
109
- expect(page?.retries).toBe(3);
110
- });
111
-
112
- it('should insert and retrieve edges', () => {
113
- const siteId = siteRepo.createSite('example.com');
114
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
115
-
116
- // Create pages first
117
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/1', last_seen_snapshot_id: snapshotId });
118
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/2', last_seen_snapshot_id: snapshotId });
119
-
120
- const p1 = pageRepo.getPage(siteId, 'http://example.com/1')!;
121
- const p2 = pageRepo.getPage(siteId, 'http://example.com/2')!;
122
-
123
- edgeRepo.insertEdge(snapshotId, p1.id, p2.id, 1.0, 'internal');
124
-
125
- const edges = edgeRepo.getEdgesBySnapshot(snapshotId);
126
- expect(edges).toHaveLength(1);
127
- expect(edges[0].source_page_id).toBe(p1.id);
128
- expect(edges[0].target_page_id).toBe(p2.id);
129
- });
130
-
131
- it('should insert and retrieve metrics', () => {
132
- const siteId = siteRepo.createSite('example.com');
133
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
134
- pageRepo.upsertPage({ site_id: siteId, normalized_url: 'http://example.com/1', last_seen_snapshot_id: snapshotId });
135
- const p1 = pageRepo.getPage(siteId, 'http://example.com/1')!;
136
-
137
- metricsRepo.insertMetrics({
138
- snapshot_id: snapshotId,
139
- page_id: p1.id,
140
- authority_score: 0.5,
141
- hub_score: 0.2,
142
- pagerank: 0.8,
143
- pagerank_score: 80.0,
144
- link_role: 'authority',
145
- crawl_status: 'fetched',
146
- word_count: 100,
147
- thin_content_score: 0.1,
148
- external_link_ratio: 0.0,
149
- orphan_score: 0,
150
- duplicate_cluster_id: null,
151
- duplicate_type: null,
152
- is_cluster_primary: 0
153
- });
154
-
155
- const metrics = metricsRepo.getMetricsForPage(snapshotId, p1.id);
156
- expect(metrics).toBeDefined();
157
- expect(metrics?.authority_score).toBe(0.5);
158
- });
159
- });
@@ -1,72 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
- import Database from 'better-sqlite3';
3
- import { initSchema } from '../src/db/schema.js';
4
- import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
- import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
-
7
- describe('SiteRepository & SnapshotRepository', () => {
8
- let db: Database.Database;
9
- let siteRepo: SiteRepository;
10
- let snapshotRepo: SnapshotRepository;
11
-
12
- beforeEach(() => {
13
- db = new Database(':memory:');
14
- initSchema(db);
15
- siteRepo = new SiteRepository(db);
16
- snapshotRepo = new SnapshotRepository(db);
17
- });
18
-
19
- afterEach(() => {
20
- db.close();
21
- });
22
-
23
- it('getAllSites should return all sites ordered by domain', () => {
24
- siteRepo.createSite('b.com');
25
- siteRepo.createSite('a.com');
26
- siteRepo.createSite('c.com');
27
-
28
- const sites = siteRepo.getAllSites();
29
- expect(sites).toHaveLength(3);
30
- expect(sites[0].domain).toBe('a.com');
31
- expect(sites[1].domain).toBe('b.com');
32
- expect(sites[2].domain).toBe('c.com');
33
- });
34
-
35
- it('getSnapshotCount should return correct count', () => {
36
- const siteId = siteRepo.createSite('test.com');
37
-
38
- expect(snapshotRepo.getSnapshotCount(siteId)).toBe(0);
39
-
40
- snapshotRepo.createSnapshot(siteId, 'full');
41
- expect(snapshotRepo.getSnapshotCount(siteId)).toBe(1);
42
-
43
- snapshotRepo.createSnapshot(siteId, 'partial');
44
- expect(snapshotRepo.getSnapshotCount(siteId)).toBe(2);
45
- });
46
-
47
- it('getLatestSnapshot should return the latest snapshot', () => {
48
- const siteId = siteRepo.createSite('test.com');
49
-
50
- // First snapshot
51
- snapshotRepo.createSnapshot(siteId, 'full', 'completed');
52
- // Wait a tiny bit to ensure timestamp diff if needed, but synchronous execution usually implies order
53
-
54
- // Second snapshot
55
- const secondId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
56
-
57
- const latest = snapshotRepo.getLatestSnapshot(siteId);
58
- expect(latest).toBeDefined();
59
- expect(latest?.id).toBe(secondId);
60
- expect(latest?.status).toBe('running');
61
- });
62
-
63
- it('getLatestSnapshot with status filter', () => {
64
- const siteId = siteRepo.createSite('test.com');
65
- const firstId = snapshotRepo.createSnapshot(siteId, 'full', 'completed');
66
- snapshotRepo.createSnapshot(siteId, 'full', 'running');
67
-
68
- const latestCompleted = snapshotRepo.getLatestSnapshot(siteId, 'completed');
69
- expect(latestCompleted).toBeDefined();
70
- expect(latestCompleted?.id).toBe(firstId);
71
- });
72
- });
@@ -1,67 +0,0 @@
1
- import { test, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { compareGraphs } from '../src/diff/compare.js';
4
-
5
- test('detects added and removed urls', () => {
6
- const oldGraph = new Graph();
7
- oldGraph.addNode('https://example.com/a', 0, 200);
8
- oldGraph.addNode('https://example.com/b', 1, 200);
9
-
10
- const newGraph = new Graph();
11
- newGraph.addNode('https://example.com/a', 0, 200);
12
- newGraph.addNode('https://example.com/c', 1, 200); // Added
13
-
14
- const diff = compareGraphs(oldGraph, newGraph);
15
- expect(diff.addedUrls).toContain('https://example.com/c');
16
- expect(diff.removedUrls).toContain('https://example.com/b');
17
- });
18
-
19
- test('detects status changes', () => {
20
- const oldGraph = new Graph();
21
- oldGraph.addNode('https://example.com/a', 0, 200);
22
-
23
- const newGraph = new Graph();
24
- newGraph.addNode('https://example.com/a', 0, 404);
25
-
26
- const diff = compareGraphs(oldGraph, newGraph);
27
- expect(diff.changedStatus).toHaveLength(1);
28
- expect(diff.changedStatus[0]).toEqual({
29
- url: 'https://example.com/a',
30
- oldStatus: 200,
31
- newStatus: 404
32
- });
33
- });
34
-
35
- test('detects canonical changes', () => {
36
- const oldGraph = new Graph();
37
- oldGraph.addNode('https://example.com/a', 0, 200);
38
- oldGraph.updateNodeData('https://example.com/a', { canonical: 'https://example.com/canon1' });
39
-
40
- const newGraph = new Graph();
41
- newGraph.addNode('https://example.com/a', 0, 200);
42
- newGraph.updateNodeData('https://example.com/a', { canonical: 'https://example.com/canon2' });
43
-
44
- const diff = compareGraphs(oldGraph, newGraph);
45
- expect(diff.changedCanonical).toHaveLength(1);
46
- expect(diff.changedCanonical[0]).toEqual({
47
- url: 'https://example.com/a',
48
- oldCanonical: 'https://example.com/canon1',
49
- newCanonical: 'https://example.com/canon2'
50
- });
51
- });
52
-
53
- test('calculates metric deltas', () => {
54
- const oldGraph = new Graph();
55
- // Orphan: A (depth 1, inLinks 0)
56
- oldGraph.addNode('https://example.com/a', 1, 200);
57
-
58
- const newGraph = new Graph();
59
- // Not Orphan: Root -> A
60
- newGraph.addNode('https://example.com/', 0, 200);
61
- newGraph.addNode('https://example.com/a', 1, 200);
62
- newGraph.addEdge('https://example.com/', 'https://example.com/a');
63
-
64
- const diff = compareGraphs(oldGraph, newGraph);
65
- // Old orphan count: 1 (A). New: 0. Delta: -1.
66
- expect(diff.metricDeltas.orphanCount).toBe(-1);
67
- });
@@ -1,110 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { detectDuplicates } from '../src/graph/duplicate.js';
4
- import { SimHash } from '../src/graph/simhash.js';
5
-
6
- describe('Duplicate Detection', () => {
7
- it('should detect exact duplicates based on contentHash', () => {
8
- const graph = new Graph();
9
- graph.addNode('https://example.com/a', 0, 200);
10
- graph.addNode('https://example.com/b', 0, 200);
11
- graph.addNode('https://example.com/c', 0, 200);
12
-
13
- graph.updateNodeData('https://example.com/a', { contentHash: 'hash1', uniqueTokenRatio: 1.0 });
14
- graph.updateNodeData('https://example.com/b', { contentHash: 'hash1', uniqueTokenRatio: 1.0 });
15
- graph.updateNodeData('https://example.com/c', { contentHash: 'hash2', uniqueTokenRatio: 1.0 });
16
-
17
- detectDuplicates(graph);
18
-
19
- expect(graph.duplicateClusters).toHaveLength(1);
20
- const cluster = graph.duplicateClusters[0];
21
- expect(cluster.type).toBe('exact');
22
- expect(cluster.size).toBe(2);
23
-
24
- const nodeA = graph.nodes.get('https://example.com/a')!;
25
- const nodeB = graph.nodes.get('https://example.com/b')!;
26
- expect(nodeA.duplicateClusterId).toBeDefined();
27
- expect(nodeA.duplicateClusterId).toBe(nodeB.duplicateClusterId);
28
-
29
- // One should be primary, one should be collapsed
30
- expect(!nodeA.isCollapsed !== !nodeB.isCollapsed).toBe(true);
31
- });
32
-
33
- it('should detect near duplicates using SimHash', () => {
34
- const graph = new Graph();
35
- graph.addNode('https://example.com/x', 0, 200);
36
- graph.addNode('https://example.com/y', 0, 200);
37
-
38
- // Calculate simhashes that are 1 bit apart
39
- const tokens1 = ['hello', 'world', 'this', 'is', 'a', 'test', 'document'];
40
- const tokens2 = ['hello', 'world', 'this', 'is', 'a', 'test', 'document2'];
41
-
42
- const h1 = SimHash.generate(tokens1);
43
- const h2 = SimHash.generate(tokens2);
44
-
45
- // Assume standard text gives < 3 diff. For reliability in test, we'll manually set string bigint representations.
46
- // Actually, we can just use the calculated ones.
47
- graph.updateNodeData('https://example.com/x', { contentHash: 'x', simhash: h1.toString(), uniqueTokenRatio: 1.0 });
48
- graph.updateNodeData('https://example.com/y', { contentHash: 'y', simhash: h2.toString(), uniqueTokenRatio: 1.0 });
49
-
50
- detectDuplicates(graph, { simhashThreshold: 10 }); // use high threshold to guarantee match
51
-
52
- expect(graph.duplicateClusters).toHaveLength(1);
53
- expect(graph.duplicateClusters[0].type).toBe('near');
54
- });
55
-
56
- it('should identify template-heavy clusters', () => {
57
- const graph = new Graph();
58
- graph.addNode('https://example.com/1', 0, 200);
59
- graph.addNode('https://example.com/2', 0, 200);
60
-
61
- graph.updateNodeData('https://example.com/1', { contentHash: 'h1', uniqueTokenRatio: 0.2 });
62
- graph.updateNodeData('https://example.com/2', { contentHash: 'h1', uniqueTokenRatio: 0.2 });
63
-
64
- detectDuplicates(graph);
65
-
66
- expect(graph.duplicateClusters[0].type).toBe('template_heavy');
67
- });
68
-
69
- it('should mark high severity on missing canonicals', () => {
70
- const graph = new Graph();
71
- graph.addNode('https://example.com/a', 0, 200);
72
- graph.addNode('https://example.com/b', 0, 200);
73
-
74
- graph.updateNodeData('https://example.com/a', { contentHash: 'h1', canonical: 'https://example.com/a' });
75
- graph.updateNodeData('https://example.com/b', { contentHash: 'h1', canonical: undefined }); // missing
76
-
77
- detectDuplicates(graph);
78
-
79
- expect(graph.duplicateClusters[0].severity).toBe('high');
80
- });
81
-
82
- it('should transfer edges during collapse', () => {
83
- const graph = new Graph();
84
- graph.addNode('https://example.com/a', 0, 200);
85
- graph.addNode('https://example.com/b', 0, 200);
86
- graph.addNode('https://example.com/source', 0, 200);
87
-
88
- graph.updateNodeData('https://example.com/a', { contentHash: 'h1' });
89
- graph.updateNodeData('https://example.com/b', { contentHash: 'h1' });
90
-
91
- // Add edge pointing to B
92
- graph.addEdge('https://example.com/source', 'https://example.com/b', 1);
93
-
94
- // Force A to be the representative by giving it higher inLinks manually, though it's determined dynamically
95
- graph.nodes.get('https://example.com/a')!.inLinks = 10;
96
-
97
- detectDuplicates(graph);
98
-
99
- const a = graph.nodes.get('https://example.com/a')!;
100
- const b = graph.nodes.get('https://example.com/b')!;
101
-
102
- expect(a.isClusterPrimary).toBe(true);
103
- expect(a.isCollapsed).toBe(false);
104
- expect(b.isCollapsed).toBe(true);
105
- expect(b.collapseInto).toBe('https://example.com/a');
106
-
107
- // Check edge transfer
108
- expect(graph.edges.has(Graph.getEdgeKey('https://example.com/source', 'https://example.com/a'))).toBe(true);
109
- });
110
- });
@@ -1,86 +0,0 @@
1
- import { extractLinks } from '../src/crawler/extract.js';
2
- import { test, expect, describe, vi, afterEach } from 'vitest';
3
- import * as cheerio from 'cheerio';
4
-
5
- // Mock cheerio.load to allow us to simulate errors
6
- vi.mock('cheerio', async (importOriginal) => {
7
- const mod = await importOriginal<any>();
8
- return {
9
- ...mod,
10
- load: vi.fn((...args: any[]) => mod.load(...args))
11
- };
12
- });
13
-
14
- describe('extractLinks', () => {
15
- afterEach(() => {
16
- vi.restoreAllMocks();
17
- });
18
-
19
- test('should extract links correctly', () => {
20
- const html = `
21
- <html>
22
- <body>
23
- <a href="/foo">Foo</a>
24
- <a href="bar">Bar</a>
25
- <a href="https://other.com/baz">Baz</a>
26
- <a href="#top">Top</a>
27
- </body>
28
- </html>
29
- `;
30
- const links = extractLinks(html, 'https://example.com/page/');
31
- expect(links).toContain('https://example.com/foo');
32
- expect(links).toContain('https://example.com/page/bar');
33
- expect(links).toContain('https://other.com/baz');
34
- expect(links).not.toContain('https://example.com/page/#top');
35
- expect(links).toContain('https://example.com/page/');
36
- });
37
-
38
- test('should handle cheerio errors gracefully', () => {
39
- const error = new Error('Cheerio error');
40
-
41
- // Mock cheerio.load to throw an error
42
- vi.mocked(cheerio.load).mockImplementationOnce(() => {
43
- throw error;
44
- });
45
-
46
- const links = extractLinks('<html></html>', 'https://example.com');
47
-
48
- expect(links).toEqual([]);
49
- // No console error expected
50
- });
51
-
52
- test('should handle non-Error exceptions gracefully', () => {
53
- const error = 'String error'; // Simulate a thrown string
54
-
55
- vi.mocked(cheerio.load).mockImplementationOnce(() => {
56
- throw error;
57
- });
58
-
59
- const links = extractLinks('<html></html>', 'https://example.com');
60
-
61
- expect(links).toEqual([]);
62
- // No console error expected
63
- });
64
-
65
- test('should ignore invalid URLs that cause URL constructor to throw', () => {
66
- const html = '<a href="http://[">Invalid</a>';
67
- const links = extractLinks(html, 'https://example.com');
68
- expect(links).toEqual([]);
69
- });
70
-
71
- test('should ignore non-http protocols', () => {
72
- const html = `
73
- <a href="mailto:test@example.com">Mail</a>
74
- <a href="javascript:void(0)">JS</a>
75
- <a href="ftp://example.com/file">FTP</a>
76
- `;
77
- const links = extractLinks(html, 'https://example.com');
78
- expect(links).toEqual([]);
79
- });
80
-
81
- test('should ignore links without href', () => {
82
- const html = '<a>No Href</a>';
83
- const links = extractLinks(html, 'https://example.com');
84
- expect(links).toEqual([]);
85
- });
86
- });
@@ -1,110 +0,0 @@
1
- import { test, expect, beforeEach, vi } from 'vitest';
2
- import { Fetcher } from '../src/crawler/fetcher.js';
3
- import { MockAgent, setGlobalDispatcher } from 'undici';
4
- import { IPGuard } from '../src/core/security/ipGuard.js';
5
-
6
- let mockAgent: MockAgent;
7
-
8
- beforeEach(() => {
9
- mockAgent = new MockAgent();
10
- mockAgent.disableNetConnect();
11
- setGlobalDispatcher(mockAgent);
12
-
13
- // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
14
- vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
15
- });
16
-
17
- test('fetches simple page', async () => {
18
- const client = mockAgent.get('https://example.com');
19
- client.intercept({ path: '/', method: 'GET' }).reply(200, 'Hello', {
20
- headers: { 'content-type': 'text/html', 'etag': '"123"', 'last-modified': 'Mon, 01 Jan 2000 00:00:00 GMT' }
21
- });
22
-
23
- const fetcher = new Fetcher();
24
- const res = await fetcher.fetch('https://example.com/');
25
- expect(res.status).toBe(200);
26
- expect(res.body).toBe('Hello');
27
- expect(res.etag).toBe('"123"');
28
- expect(res.lastModified).toBe('Mon, 01 Jan 2000 00:00:00 GMT');
29
- expect(res.redirectChain).toEqual([]);
30
- });
31
-
32
- test('follows redirects', async () => {
33
- const client = mockAgent.get('https://example.com');
34
- // A -> B
35
- client.intercept({ path: '/a', method: 'GET' }).reply(301, '', {
36
- headers: { location: '/b' }
37
- });
38
- // B -> C
39
- client.intercept({ path: '/b', method: 'GET' }).reply(302, '', {
40
- headers: { location: 'https://other.com/c' }
41
- });
42
-
43
- const otherClient = mockAgent.get('https://other.com');
44
- // C -> 200
45
- otherClient.intercept({ path: '/c', method: 'GET' }).reply(200, 'Final');
46
-
47
- const fetcher = new Fetcher();
48
- const res = await fetcher.fetch('https://example.com/a');
49
-
50
- expect(res.status).toBe(200);
51
- expect(res.body).toBe('Final');
52
- expect(res.finalUrl).toBe('https://other.com/c');
53
- expect(res.redirectChain.length).toBe(2);
54
- expect(res.redirectChain[0]).toEqual({ url: 'https://example.com/a', status: 301, target: 'https://example.com/b' });
55
- expect(res.redirectChain[1]).toEqual({ url: 'https://example.com/b', status: 302, target: 'https://other.com/c' });
56
- });
57
-
58
- test('detects redirect loop', async () => {
59
- const client = mockAgent.get('https://loop.com');
60
- // A -> B
61
- client.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
62
- // B -> A (This will be detected as loop)
63
- client.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
64
-
65
- const fetcher = new Fetcher();
66
- const res = await fetcher.fetch('https://loop.com/a');
67
-
68
- // Should return the redirect_loop security error
69
- expect(res.status).toBe('redirect_loop');
70
- expect(res.redirectChain.length).toBe(1); // Detected while resolving target of B
71
- expect(res.redirectChain[0].url).toBe('https://loop.com/a');
72
- });
73
-
74
- test('sends conditional headers', async () => {
75
- const client = mockAgent.get('https://cache.com');
76
-
77
- client.intercept({
78
- path: '/',
79
- method: 'GET',
80
- headers: {
81
- 'If-None-Match': '"123"',
82
- 'If-Modified-Since': 'Mon, 01 Jan 2000 00:00:00 GMT'
83
- }
84
- }).reply(304, '', { headers: { etag: '"123"' } });
85
-
86
- const fetcher = new Fetcher();
87
- const res = await fetcher.fetch('https://cache.com/', {
88
- etag: '"123"',
89
- lastModified: 'Mon, 01 Jan 2000 00:00:00 GMT'
90
- });
91
-
92
- expect(res.status).toBe(304);
93
- expect(res.body).toBe('');
94
- });
95
-
96
- test('handles max redirects', async () => {
97
- const client = mockAgent.get('https://max.com');
98
- // 11 redirects
99
- for (let i = 0; i < 11; i++) {
100
- client.intercept({ path: `/p${i}`, method: 'GET' }).reply(301, '', { headers: { location: `/p${i + 1}` } });
101
- }
102
-
103
- // Set maxRedirects to 10 to trigger failure exactly after 10 hops
104
- // Increase rate to prevent timeout (11 requests * 500ms > 5000ms)
105
- const fetcher = new Fetcher({ maxRedirects: 10, rate: 100 });
106
- const res = await fetcher.fetch('https://max.com/p0');
107
-
108
- expect(res.status).toBe('redirect_limit_exceeded');
109
- expect(res.redirectChain.length).toBe(10);
110
- });