@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,196 +0,0 @@
1
- import { Graph } from '../src/graph/graph.js';
2
- import { calculateMetrics } from '../src/graph/metrics.js';
3
- import { test, expect } from 'vitest';
4
-
5
- test('graph metrics basic', () => {
6
- const g = new Graph();
7
-
8
- // Structure:
9
- // A -> B
10
- // A -> C
11
- // B -> C
12
- // C -> A
13
-
14
- g.addNode('A', 0, 200);
15
- g.addNode('B', 1, 200);
16
- g.addNode('C', 1, 200);
17
-
18
- g.addEdge('A', 'B');
19
- g.addEdge('A', 'C');
20
- g.addEdge('B', 'C');
21
- g.addEdge('C', 'A');
22
-
23
- const metrics = calculateMetrics(g, 5);
24
-
25
- expect(metrics.totalPages).toBe(3);
26
- expect(metrics.totalEdges).toBe(4);
27
-
28
- // Check degrees on nodes directly
29
- const nodeA = g.nodes.get('A');
30
- expect(nodeA?.inLinks).toBe(1);
31
- expect(nodeA?.outLinks).toBe(2);
32
-
33
- const nodeC = g.nodes.get('C');
34
- expect(nodeC?.inLinks).toBe(2);
35
- expect(nodeC?.outLinks).toBe(1);
36
-
37
- expect(metrics.averageOutDegree).toBeCloseTo(4/3);
38
-
39
- // Top authority should be C with 2 in-links, authority = 1
40
- expect(metrics.topAuthorityPages[0].url).toBe('C');
41
- expect(metrics.topAuthorityPages[0].authority).toBeCloseTo(1);
42
-
43
- // Max depth found
44
- expect(metrics.maxDepthFound).toBe(1);
45
-
46
- // Orphan pages (none)
47
- expect(metrics.orphanPages).toEqual([]);
48
- });
49
-
50
- test('orphan pages', () => {
51
- const g = new Graph();
52
- g.addNode('Root', 0, 200);
53
- g.addNode('Orphan', 1, 200);
54
- // Orphan is at depth 1 but no incoming edges recorded (maybe missed or filtered)
55
-
56
- const metrics = calculateMetrics(g, 5);
57
- expect(metrics.orphanPages).toContain('Orphan');
58
- expect(metrics.orphanPages).not.toContain('Root');
59
- });
60
- test('metrics v2 calculations', () => {
61
- const g = new Graph();
62
-
63
- // Root (depth 0, in=0, out=2)
64
- g.addNode('root', 0, 200);
65
-
66
- // A (depth 1, in=1, out=1)
67
- g.addNode('A', 1, 200);
68
- g.addEdge('root', 'A');
69
-
70
- // B (depth 1, in=1, out=0)
71
- g.addNode('B', 1, 200);
72
- g.addEdge('root', 'B');
73
-
74
- // C (depth 2, in=1, out=0)
75
- g.addNode('C', 2, 200);
76
- g.addEdge('A', 'C');
77
-
78
- // Orphan (depth 1, in=0) - e.g. added but no edge to it?
79
- // If it's in graph with depth > 0 and inLinks=0, it's an orphan.
80
- g.addNode('orphan', 1, 200);
81
-
82
- // Near Orphan (depth 3, in=1)
83
- g.addNode('D', 2, 200);
84
- g.addNode('nearOrphan', 3, 200);
85
- g.addEdge('C', 'D'); // C->D
86
- g.addEdge('D', 'nearOrphan'); // D->nearOrphan
87
-
88
- // Deep page (depth 4)
89
- g.addNode('deep', 4, 200);
90
- g.addEdge('nearOrphan', 'deep');
91
-
92
- // Nodes: root(0), A(1), B(1), C(2), orphan(1), D(2), nearOrphan(3), deep(4)
93
- // Total pages: 8
94
-
95
- // Edges: root->A, root->B, A->C, C->D, D->nearOrphan, nearOrphan->deep
96
- // Total edges: 6
97
-
98
- // InLinks:
99
- // root: 0
100
- // A: 1
101
- // B: 1
102
- // C: 1
103
- // orphan: 0
104
- // D: 1
105
- // nearOrphan: 1
106
- // deep: 1
107
-
108
- // Max InLinks = 1.
109
- // Authority Score = log(1 + in) / log(1 + maxIn)
110
- // If maxIn = 1, log(2).
111
- // For A: log(2)/log(2) = 1.
112
- // For root: log(1)/log(2) = 0.
113
-
114
- // Let's make maxIn > 1 to test better.
115
- g.addNode('popular', 1, 200);
116
- g.addEdge('root', 'popular');
117
- g.addEdge('A', 'popular');
118
- // popular inLinks = 2. MaxIn = 2.
119
- // Authority popular = log(3)/log(3) = 1.
120
- // Authority A = log(2)/log(3) approx 0.63
121
-
122
- const metrics = calculateMetrics(g, 10); // maxDepth arg (not used for calculation logic of deepPages which is hardcoded >=4 per prompt?)
123
- // Prompt says "deepPages: depth >= 4".
124
- // Existing calculateMetrics takes maxDepth arg.
125
- // Existing: deepPages = nodes.filter(n => n.depth >= maxDepth)
126
- // New requirement: deepPages: depth >= 4.
127
- // I should probably ignore the argument or update the requirement interpretation.
128
- // "deepPages: depth >= 4" implies fixed threshold.
129
-
130
- // Orphan pages: inLinks === 0 && depth > 0
131
- expect(metrics.orphanPages).toContain('orphan');
132
- expect(metrics.orphanPages).not.toContain('root'); // depth 0
133
-
134
- // Near orphans: inLinks === 1 && depth >= 3
135
- expect(metrics.nearOrphans).toContain('nearOrphan'); // depth 3, in 1
136
- expect(metrics.nearOrphans).toContain('deep'); // depth 4, in 1 (from nearOrphan)
137
- expect(metrics.nearOrphans).not.toContain('D'); // depth 2
138
-
139
- // Deep pages: depth >= 4
140
- expect(metrics.deepPages).toContain('deep');
141
- expect(metrics.deepPages).not.toContain('nearOrphan');
142
-
143
- // Crawl Efficiency Score: 1 - (deepPagesCount / totalPages)
144
- // Total: 9 nodes (root, A, B, C, orphan, D, nearOrphan, deep, popular)
145
- // Deep: 1 (deep)
146
- // Score: 1 - 1/9 = 8/9 = 0.888...
147
- expect(metrics.crawlEfficiencyScore).toBeCloseTo(8/9);
148
-
149
- // Average Depth: sum(depth) / totalPages
150
- // Depths: 0, 1, 1, 2, 1, 2, 3, 4, 1
151
- // Sum: 15
152
- // Avg: 15/9 = 1.666...
153
- expect(metrics.averageDepth).toBeCloseTo(15/9);
154
-
155
- // Structural Entropy
156
- // OutDegrees:
157
- // root: 3 (A, B, popular)
158
- // A: 2 (C, popular)
159
- // B: 0
160
- // C: 1 (D)
161
- // orphan: 0
162
- // D: 1 (nearOrphan)
163
- // nearOrphan: 1 (deep)
164
- // deep: 0
165
- // popular: 0
166
-
167
- // Distribution:
168
- // 0: 4 nodes (B, orphan, deep, popular)
169
- // 1: 3 nodes (C, D, nearOrphan)
170
- // 2: 1 node (A)
171
- // 3: 1 node (root)
172
-
173
- // P(0) = 4/9
174
- // P(1) = 3/9
175
- // P(2) = 1/9
176
- // P(3) = 1/9
177
-
178
- // Entropy = - (4/9 log2(4/9) + 3/9 log2(3/9) + 1/9 log2(1/9) + 1/9 log2(1/9))
179
- // = - (0.444 * -1.17 + 0.333 * -1.58 + 0.111 * -3.17 + 0.111 * -3.17)
180
- // approx 1.75
181
-
182
- // Let's compute exact expected value
183
- const p0 = 4/9;
184
- const p1 = 3/9;
185
- const p2 = 1/9;
186
- const p3 = 1/9;
187
- const entropy = - (p0 * Math.log2(p0) + p1 * Math.log2(p1) + p2 * Math.log2(p2) + p3 * Math.log2(p3));
188
-
189
- expect(metrics.structuralEntropy).toBeCloseTo(entropy);
190
-
191
- // Limit Reached
192
- expect(metrics.limitReached).toBe(false);
193
- g.limitReached = true;
194
- const metrics2 = calculateMetrics(g, 10);
195
- expect(metrics2.limitReached).toBe(true);
196
- });
@@ -1,101 +0,0 @@
1
- import { normalizeUrl } from '../src/crawler/normalize.js';
2
- import { extractLinks } from '../src/crawler/extract.js';
3
- import { test, expect } from 'vitest';
4
-
5
- test('normalizeUrl', () => {
6
- expect(normalizeUrl('https://Example.com/Foo/', '')).toBe('https://example.com/Foo');
7
- expect(normalizeUrl('http://example.com:80/bar', '')).toBe('http://example.com/bar');
8
- expect(normalizeUrl('https://example.com/baz#frag', '')).toBe('https://example.com/baz');
9
- expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: true })).toBe('https://example.com/qux');
10
- expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: false })).toBe('https://example.com/qux?a=1');
11
- expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
12
- });
13
-
14
- test('extractLinks', () => {
15
- const html = `
16
- <html>
17
- <body>
18
- <a href="/foo">Foo</a>
19
- <a href="bar">Bar</a>
20
- <a href="https://other.com/baz">Baz</a>
21
- <a href="#top">Top</a>
22
- </body>
23
- </html>
24
- `;
25
- const links = extractLinks(html, 'https://example.com/page/');
26
- expect(links).toContain('https://example.com/foo');
27
- expect(links).toContain('https://example.com/page/bar');
28
- expect(links).toContain('https://other.com/baz');
29
- expect(links).not.toContain('https://example.com/page/#top');
30
- expect(links).toContain('https://example.com/page/'); // #top resolves to base url without fragment
31
- });
32
- test('normalizeUrl: absolute resolution', () => {
33
- expect(normalizeUrl('/foo', 'https://example.com')).toBe('https://example.com/foo');
34
- expect(normalizeUrl('bar', 'https://example.com/baz/')).toBe('https://example.com/baz/bar');
35
- expect(normalizeUrl('//other.com/foo', 'https://example.com')).toBe('https://other.com/foo');
36
- });
37
-
38
- test('normalizeUrl: only http/https', () => {
39
- expect(normalizeUrl('ftp://example.com/file', 'https://example.com')).toBeNull();
40
- expect(normalizeUrl('mailto:user@example.com', 'https://example.com')).toBeNull();
41
- expect(normalizeUrl('javascript:alert(1)', 'https://example.com')).toBeNull();
42
- });
43
-
44
- test('normalizeUrl: lowercase hostname', () => {
45
- expect(normalizeUrl('https://EXAMPLE.com/foo', '')).toBe('https://example.com/foo');
46
- });
47
-
48
- test('normalizeUrl: remove default ports', () => {
49
- expect(normalizeUrl('http://example.com:80/foo', '')).toBe('http://example.com/foo');
50
- expect(normalizeUrl('https://example.com:443/foo', '')).toBe('https://example.com/foo');
51
- expect(normalizeUrl('http://example.com:8080/foo', '')).toBe('http://example.com:8080/foo');
52
- });
53
-
54
- test('normalizeUrl: remove hash fragments', () => {
55
- expect(normalizeUrl('https://example.com/foo#bar', '')).toBe('https://example.com/foo');
56
- });
57
-
58
- test('normalizeUrl: strip query', () => {
59
- expect(normalizeUrl('https://example.com/foo?a=1&b=2', '', { stripQuery: true })).toBe('https://example.com/foo');
60
- });
61
-
62
- test('normalizeUrl: filter tracking params', () => {
63
- const url = 'https://example.com/foo?utm_source=google&utm_medium=cpc&a=1&fbclid=123';
64
- expect(normalizeUrl(url, '', { stripQuery: false })).toBe('https://example.com/foo?a=1');
65
-
66
- const url2 = 'https://example.com/foo?gclid=abc&msclkid=def';
67
- expect(normalizeUrl(url2, '', { stripQuery: false })).toBe('https://example.com/foo');
68
- });
69
-
70
- test('normalizeUrl: trailing slash', () => {
71
- expect(normalizeUrl('https://example.com/foo/', '')).toBe('https://example.com/foo');
72
- expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
73
- });
74
-
75
- test('normalizeUrl: collapse duplicate slashes', () => {
76
- expect(normalizeUrl('https://example.com/foo//bar', '')).toBe('https://example.com/foo/bar');
77
- expect(normalizeUrl('https://example.com//foo///bar', '')).toBe('https://example.com/foo/bar');
78
- });
79
-
80
- test('normalizeUrl: skip non-HTML assets', () => {
81
- expect(normalizeUrl('https://example.com/file.pdf', '')).toBeNull();
82
- expect(normalizeUrl('https://example.com/image.jpg', '')).toBeNull();
83
- expect(normalizeUrl('https://example.com/image.png', '')).toBeNull();
84
- expect(normalizeUrl('https://example.com/image.svg', '')).toBeNull();
85
- expect(normalizeUrl('https://example.com/image.webp', '')).toBeNull();
86
- expect(normalizeUrl('https://example.com/image.gif', '')).toBeNull();
87
- expect(normalizeUrl('https://example.com/archive.zip', '')).toBeNull();
88
- expect(normalizeUrl('https://example.com/data.xml', '')).toBeNull();
89
- expect(normalizeUrl('https://example.com/data.json', '')).toBeNull();
90
- expect(normalizeUrl('https://example.com/video.mp4', '')).toBeNull();
91
-
92
- // HTML extensions should pass (or no extension)
93
- expect(normalizeUrl('https://example.com/page.html', '')).toBe('https://example.com/page.html');
94
- expect(normalizeUrl('https://example.com/page.htm', '')).toBe('https://example.com/page.htm');
95
- expect(normalizeUrl('https://example.com/page', '')).toBe('https://example.com/page');
96
- });
97
-
98
- test('normalizeUrl: return format', () => {
99
- const res = normalizeUrl('https://example.com/foo?a=1', '');
100
- expect(res).toBe('https://example.com/foo?a=1');
101
- });
@@ -1,160 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { annotateOrphans, calculateOrphanSeverity, mapImpactLevel, type SitegraphNode, type SitegraphEdge } from '../src/scoring/orphanSeverity.js';
3
-
4
- function baseNode(url: string, overrides: Partial<SitegraphNode> = {}): SitegraphNode {
5
- return {
6
- url,
7
- depth: 1,
8
- inLinks: 0,
9
- outLinks: 0,
10
- status: 200,
11
- ...overrides
12
- };
13
- }
14
-
15
- describe('orphan detection and severity scoring', () => {
16
- test('hard orphan detection and homepage exclusion', () => {
17
- const nodes: SitegraphNode[] = [
18
- baseNode('https://example.com/', { depth: 0, inLinks: 0 }),
19
- baseNode('https://example.com/orphan', { inLinks: 0 })
20
- ];
21
- const edges: SitegraphEdge[] = [];
22
-
23
- const result = annotateOrphans(nodes, edges, {
24
- enabled: true,
25
- severityEnabled: false,
26
- includeSoftOrphans: false,
27
- minInbound: 2,
28
- rootUrl: 'https://example.com/'
29
- });
30
-
31
- expect(result[0]).toMatchObject({ orphan: false });
32
- expect(result[1]).toMatchObject({ orphan: true, orphanType: 'hard' });
33
- });
34
-
35
- test('near orphan threshold override', () => {
36
- const nodes = [baseNode('https://example.com/near', { inLinks: 2 })];
37
- const edges: SitegraphEdge[] = [];
38
-
39
- const resultDefault = annotateOrphans(nodes, edges, {
40
- enabled: true,
41
- severityEnabled: false,
42
- includeSoftOrphans: false,
43
- minInbound: 2
44
- });
45
- const resultStrict = annotateOrphans(nodes, edges, {
46
- enabled: true,
47
- severityEnabled: false,
48
- includeSoftOrphans: false,
49
- minInbound: 1
50
- });
51
-
52
- expect(resultDefault[0]).toMatchObject({ orphan: true, orphanType: 'near' });
53
- expect(resultStrict[0]).toMatchObject({ orphan: false });
54
- });
55
-
56
- test('soft orphan detection only when enabled and inbound only from low-value sources', () => {
57
- const nodes: SitegraphNode[] = [
58
- baseNode('https://example.com/tag/seo', { pageType: 'tag', outLinks: 1 }),
59
- baseNode('https://example.com/list?page=2', { pageType: 'pagination', outLinks: 1 }),
60
- baseNode('https://example.com/target', { inLinks: 2 }),
61
- baseNode('https://example.com/normal', { outLinks: 1 })
62
- ];
63
-
64
- const edges: SitegraphEdge[] = [
65
- { source: 'https://example.com/tag/seo', target: 'https://example.com/target' },
66
- { source: 'https://example.com/list?page=2', target: 'https://example.com/target' }
67
- ];
68
-
69
- const withSoft = annotateOrphans(nodes, edges, {
70
- enabled: true,
71
- severityEnabled: false,
72
- includeSoftOrphans: true,
73
- minInbound: 1
74
- });
75
-
76
- const withoutSoft = annotateOrphans(nodes, edges, {
77
- enabled: true,
78
- severityEnabled: false,
79
- includeSoftOrphans: false,
80
- minInbound: 1
81
- });
82
-
83
- expect(withSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: true, orphanType: 'soft' });
84
- expect(withoutSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: false });
85
- });
86
-
87
- test('crawl-only orphan detection', () => {
88
- const nodes = [baseNode('https://example.com/sitemap-only', { inLinks: 0, discoveredViaSitemap: true })];
89
- const result = annotateOrphans(nodes, [], {
90
- enabled: true,
91
- severityEnabled: false,
92
- includeSoftOrphans: false,
93
- minInbound: 2
94
- });
95
-
96
- expect(result[0]).toMatchObject({ orphan: true, orphanType: 'crawl-only' });
97
- });
98
-
99
- test('severity calculation modifiers and score clamping', () => {
100
- const high = calculateOrphanSeverity('hard', baseNode('https://example.com/high', {
101
- inLinks: 0,
102
- wordCount: 1500,
103
- hasStructuredData: true,
104
- depth: 1,
105
- isProductOrCommercial: true
106
- }));
107
-
108
- const low = calculateOrphanSeverity('hard', baseNode('https://example.com/low', {
109
- inLinks: 0,
110
- wordCount: 120,
111
- noindex: true,
112
- duplicateContent: true,
113
- pageType: 'archive'
114
- }));
115
-
116
- expect(high).toBe(100);
117
- expect(low).toBe(80);
118
- });
119
-
120
- test('impact level mapping', () => {
121
- expect(mapImpactLevel(0)).toBe('low');
122
- expect(mapImpactLevel(39)).toBe('low');
123
- expect(mapImpactLevel(40)).toBe('medium');
124
- expect(mapImpactLevel(69)).toBe('medium');
125
- expect(mapImpactLevel(70)).toBe('high');
126
- expect(mapImpactLevel(89)).toBe('high');
127
- expect(mapImpactLevel(90)).toBe('critical');
128
- expect(mapImpactLevel(100)).toBe('critical');
129
- });
130
-
131
- test('canonical consolidation, robots exclusion, and deterministic JSON output snapshot', () => {
132
- const nodes: SitegraphNode[] = [
133
- baseNode('https://example.com/canonical', { inLinks: 0 }),
134
- baseNode('https://example.com/variant?a=1', { canonicalUrl: 'https://example.com/canonical', inLinks: 1 }),
135
- baseNode('https://example.com/blocked', { inLinks: 0, robotsExcluded: true }),
136
- baseNode('https://example.com/redirect-target', { inLinks: 1 })
137
- ];
138
-
139
- const edges: SitegraphEdge[] = [
140
- { source: 'https://example.com/redirect-source', target: 'https://example.com/redirect-target' }
141
- ];
142
-
143
- const options = {
144
- enabled: true,
145
- severityEnabled: true,
146
- includeSoftOrphans: true,
147
- minInbound: 2
148
- };
149
-
150
- const first = annotateOrphans(nodes, edges, options);
151
- const second = annotateOrphans(nodes, edges, options);
152
-
153
- expect(first).toEqual(second);
154
- expect(first.find((n) => n.url.endsWith('/canonical'))).toMatchObject({ orphan: true, orphanType: 'near' });
155
- expect(first.find((n) => n.url.endsWith('/blocked'))).toMatchObject({ orphan: false });
156
-
157
- const normalized = JSON.stringify(first, null, 2).replace(/\r\n/g, '\n');
158
- expect(normalized).toMatchSnapshot();
159
- });
160
- });
@@ -1,98 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { computePageRank } from '../src/graph/pagerank.js';
4
-
5
- describe('PageRank Engine', () => {
6
- it('should calculate identical PageRank for a simple loop', () => {
7
- const graph = new Graph();
8
- graph.addNode('https://a.com', 0, 200);
9
- graph.addNode('https://b.com', 1, 200);
10
- graph.addEdge('https://a.com', 'https://b.com');
11
- graph.addEdge('https://b.com', 'https://a.com');
12
-
13
- computePageRank(graph);
14
- const nodes = graph.getNodes();
15
-
16
- expect(nodes[0].pageRank).toBeCloseTo(0.5, 4);
17
- expect(nodes[1].pageRank).toBeCloseTo(0.5, 4);
18
- expect(nodes[0].pageRankScore).toBe(100);
19
- expect(nodes[1].pageRankScore).toBe(100);
20
- });
21
-
22
- it('should identify the center of a star graph as most important', () => {
23
- const graph = new Graph();
24
- graph.addNode('https://center.com', 0, 200);
25
- graph.addNode('https://p1.com', 1, 200);
26
- graph.addNode('https://p2.com', 1, 200);
27
- graph.addNode('https://p3.com', 1, 200);
28
-
29
- // Star in: all link to center
30
- graph.addEdge('https://p1.com', 'https://center.com');
31
- graph.addEdge('https://p2.com', 'https://center.com');
32
- graph.addEdge('https://p3.com', 'https://center.com');
33
-
34
- computePageRank(graph);
35
- const nodes = graph.getNodes();
36
-
37
- const center = nodes.find(n => n.url.includes('center'))!;
38
- const leaves = nodes.filter(n => !n.url.includes('center'));
39
-
40
- expect(center.pageRankScore).toBe(100);
41
- leaves.forEach(leaf => {
42
- expect(leaf.pageRankScore).toBeLessThan(100);
43
- expect(leaf.pageRank!).toBeLessThan(center.pageRank!);
44
- });
45
- });
46
-
47
- it('should respect link weights (Body > Nav > Footer)', () => {
48
- const graph = new Graph();
49
- graph.addNode('https://source.com', 0, 200);
50
- graph.addNode('https://body-target.com', 1, 200);
51
- graph.addNode('https://footer-target.com', 1, 200);
52
-
53
- // Body weight 1.0, Footer weight 0.4
54
- graph.addEdge('https://source.com', 'https://body-target.com', 1.0);
55
- graph.addEdge('https://source.com', 'https://footer-target.com', 0.4);
56
-
57
- computePageRank(graph);
58
-
59
- const bodyTarget = graph.nodes.get('https://body-target.com')!;
60
- const footerTarget = graph.nodes.get('https://footer-target.com')!;
61
-
62
- expect(bodyTarget.pageRank!).toBeGreaterThan(footerTarget.pageRank!);
63
- });
64
-
65
- it('should handle sink nodes by redistributing rank', () => {
66
- const graph = new Graph();
67
- graph.addNode('https://a.com', 0, 200);
68
- graph.addNode('https://b.com', 1, 200); // b is a sink
69
- graph.addEdge('https://a.com', 'https://b.com');
70
-
71
- computePageRank(graph);
72
-
73
- const nodeA = graph.nodes.get('https://a.com')!;
74
- const nodeB = graph.nodes.get('https://b.com')!;
75
-
76
- // Without redistribution, A would lose all rank.
77
- // With redistribution, A should still have some rank.
78
- expect(nodeA.pageRank).toBeGreaterThan(0);
79
- expect(nodeB.pageRank).toBeGreaterThan(nodeA.pageRank!);
80
- });
81
-
82
- it('should exclude noindex pages from receiving or passing rank', () => {
83
- const graph = new Graph();
84
- graph.addNode('https://a.com', 0, 200);
85
- graph.addNode('https://no-index.com', 1, 200);
86
- graph.nodes.get('https://no-index.com')!.noindex = true;
87
-
88
- graph.addEdge('https://a.com', 'https://no-index.com');
89
-
90
- computePageRank(graph);
91
-
92
- const nodeA = graph.nodes.get('https://a.com')!;
93
- const nodeNoIndex = graph.nodes.get('https://no-index.com')!;
94
-
95
- expect(nodeNoIndex.pageRank).toBeUndefined();
96
- expect(nodeA.pageRank).toBe(1.0); // Only one eligible node
97
- });
98
- });
@@ -1,117 +0,0 @@
1
- import { test, expect } from 'vitest';
2
- import { Parser } from '../src/crawler/parser.js';
3
-
4
- const parser = new Parser();
5
- const baseUrl = 'https://example.com';
6
-
7
- test('extracts links correctly', () => {
8
- const html = `
9
- <html>
10
- <body>
11
- <a href="/page1">Page 1</a>
12
- <a href="https://other.com">Other</a>
13
- <a href="#hash">Hash</a>
14
- <a href="javascript:void(0)">JS</a>
15
- </body>
16
- </html>
17
- `;
18
- const result = parser.parse(html, baseUrl, 200);
19
- const urls = result.links.map(l => l.url);
20
- expect(urls).toContain('https://example.com/page1');
21
- expect(urls).toContain('https://other.com/');
22
- expect(urls).not.toContain('https://example.com/#hash');
23
- // It also extracts the base URL itself from href="#hash"
24
- expect(urls).toContain('https://example.com/');
25
- expect(result.links.length).toBe(3);
26
- });
27
-
28
- test('respects nofollow on links', () => {
29
- const html = `
30
- <html>
31
- <body>
32
- <a href="/page1" rel="nofollow">Page 1</a>
33
- <a href="/page2">Page 2</a>
34
- </body>
35
- </html>
36
- `;
37
- const result = parser.parse(html, baseUrl, 200);
38
- const urls = result.links.map(l => l.url);
39
- expect(urls).not.toContain('https://example.com/page1');
40
- expect(urls).toContain('https://example.com/page2');
41
- });
42
-
43
- test('respects meta robots nofollow', () => {
44
- const html = `
45
- <html>
46
- <head>
47
- <meta name="robots" content="nofollow">
48
- </head>
49
- <body>
50
- <a href="/page1">Page 1</a>
51
- </body>
52
- </html>
53
- `;
54
- const result = parser.parse(html, baseUrl, 200);
55
- expect(result.nofollow).toBe(true);
56
- expect(result.links.length).toBe(0);
57
- });
58
-
59
- test('detects canonical', () => {
60
- const html = `
61
- <html>
62
- <head>
63
- <link rel="canonical" href="https://example.com/canon">
64
- </head>
65
- </html>
66
- `;
67
- const result = parser.parse(html, baseUrl, 200);
68
- expect(result.canonical).toBe('https://example.com/canon');
69
- });
70
-
71
- test('detects relative canonical', () => {
72
- const html = `
73
- <html>
74
- <head>
75
- <link rel="canonical" href="/canon">
76
- </head>
77
- </html>
78
- `;
79
- const result = parser.parse(html, baseUrl, 200);
80
- expect(result.canonical).toBe('https://example.com/canon');
81
- });
82
-
83
- test('detects soft 404', () => {
84
- const html = `
85
- <html>
86
- <head><title>Page Not Found</title></head>
87
- <body>Sorry, the page you are looking for does not exist.</body>
88
- </html>
89
- `;
90
- const result = parser.parse(html, baseUrl, 200);
91
- expect(result.soft404Score).toBeGreaterThanOrEqual(0.5);
92
- });
93
-
94
- test('content hash ignores scripts', () => {
95
- const html1 = `
96
- <html><body><script>var x=1;</script><p>Hello</p></body></html>
97
- `;
98
- const html2 = `
99
- <html><body><script>var x=2;</script><p>Hello</p></body></html>
100
- `;
101
- const result1 = parser.parse(html1, baseUrl, 200);
102
- const result2 = parser.parse(html2, baseUrl, 200);
103
- expect(result1.contentHash).toBe(result2.contentHash);
104
- });
105
-
106
- test('detects meta robots noindex', () => {
107
- const html = `
108
- <html>
109
- <head>
110
- <meta name="robots" content="noindex, nofollow">
111
- </head>
112
- </html>
113
- `;
114
- const result = parser.parse(html, baseUrl, 200);
115
- expect(result.noindex).toBe(true);
116
- expect(result.nofollow).toBe(true);
117
- });