@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,91 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { Fetcher } from '../src/crawler/fetcher.js';
3
- import { request } from 'undici';
4
-
5
- vi.mock('undici', () => {
6
- return {
7
- request: vi.fn(),
8
- Agent: class {
9
- dispatch = vi.fn();
10
- },
11
- Dispatcher: class {}
12
- };
13
- });
14
-
15
- describe('Fetcher Safety Integration', () => {
16
- let fetcher: Fetcher;
17
-
18
- beforeEach(() => {
19
- vi.clearAllMocks();
20
- fetcher = new Fetcher({ rate: 100 }); // High rate for tests
21
- });
22
-
23
- it('should block internal IPs', async () => {
24
- const res = await fetcher.fetch('http://127.0.0.1');
25
- expect(res.status).toBe('blocked_internal_ip');
26
- });
27
-
28
- it('should block internal IPs in redirects', async () => {
29
- const mockRequest = vi.mocked(request);
30
-
31
- // First request is fine, returns redirect
32
- mockRequest.mockResolvedValueOnce({
33
- statusCode: 301,
34
- headers: { location: 'http://192.168.1.1' },
35
- body: { dump: vi.fn(), text: vi.fn().mockResolvedValue('') }
36
- } as any);
37
-
38
- const res = await fetcher.fetch('http://example.com');
39
- expect(res.status).toBe('blocked_internal_ip');
40
- expect(res.redirectChain).toHaveLength(1); // Records the redirect that led to block
41
- expect(res.redirectChain[0].target).toBe('http://192.168.1.1/');
42
- });
43
-
44
- it('should enforce max bytes', async () => {
45
- const mockRequest = vi.mocked(request);
46
-
47
- mockRequest.mockResolvedValueOnce({
48
- statusCode: 200,
49
- headers: {},
50
- body: {
51
- on: vi.fn((event, cb) => {
52
- if (event === 'data') {
53
- cb(Buffer.alloc(1000));
54
- cb(Buffer.alloc(1000));
55
- }
56
- return { on: vi.fn() };
57
- }),
58
- destroy: vi.fn(),
59
- dump: vi.fn()
60
- }
61
- } as any);
62
-
63
- const res = await fetcher.fetch('http://example.com', { maxBytes: 500 });
64
- expect(res.status).toBe('oversized');
65
- });
66
-
67
- it('should retry on 500', async () => {
68
- const mockRequest = vi.mocked(request);
69
-
70
- mockRequest
71
- .mockResolvedValueOnce({
72
- statusCode: 500,
73
- headers: {},
74
- body: { dump: vi.fn().mockResolvedValue(undefined) }
75
- } as any)
76
- .mockResolvedValueOnce({
77
- statusCode: 200,
78
- headers: {},
79
- body: {
80
- on: vi.fn((event, cb) => {
81
- if (event === 'data') cb(Buffer.from('ok'));
82
- if (event === 'end') cb();
83
- })
84
- }
85
- } as any);
86
-
87
- const res = await fetcher.fetch('http://example.com');
88
- expect(res.status).toBe(200);
89
- expect(res.retries).toBe(1);
90
- });
91
- });
@@ -1,26 +0,0 @@
1
- {
2
- "pages": [
3
- {
4
- "url": "https://example.com/",
5
- "status": 200,
6
- "depth": 0,
7
- "html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='This is an intentionally long and descriptive meta description designed to fit ideal search snippet lengths with rich context for users and engines.'/></head><body><h1>Home</h1><nav><a href='/skip'>Nav</a></nav><p>Welcome to the homepage. This page contains meaningful content. Another sentence here.</p><img src='/a.jpg' alt='hero'><img src='/b.jpg'><a href='/about'>About</a><a href='https://external.com' rel='nofollow noopener'>External</a><script type='application/ld+json'>{\"@context\":\"https://schema.org\",\"@type\":\"WebSite\"}</script></body></html>"
8
- },
9
- {
10
- "url": "https://example.com/about",
11
- "status": 200,
12
- "depth": 1,
13
- "html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='short desc'/></head><body><h1>Example Home Page SEO Title For Strong Ranking Signals 12345</h1><h1>Second</h1><p>Duplicate body sentence. Duplicate body sentence.</p><img src='/c.jpg' alt=''><script type='application/ld+json'>not-json</script><a href='https://example.com/'>Home</a></body></html>"
14
- },
15
- {
16
- "url": "https://example.com/empty",
17
- "status": 200,
18
- "depth": 2,
19
- "html": ""
20
- }
21
- ],
22
- "edges": [
23
- { "source": "https://example.com/", "target": "https://example.com/about" },
24
- { "source": "https://example.com/about", "target": "https://example.com/" }
25
- ]
26
- }
@@ -1,100 +0,0 @@
1
- import { describe, it, expect, beforeEach } from 'vitest';
2
- import { Graph } from '../../src/graph/graph.js';
3
-
4
- describe('Graph', () => {
5
- let graph: Graph;
6
-
7
- beforeEach(() => {
8
- graph = new Graph();
9
- });
10
-
11
- it('should add a new node', () => {
12
- graph.addNode('http://example.com', 0, 200);
13
- const node = graph.nodes.get('http://example.com');
14
- expect(node).toBeDefined();
15
- expect(node?.depth).toBe(0);
16
- expect(node?.status).toBe(200);
17
- });
18
-
19
- it('should update existing node status if non-zero', () => {
20
- graph.addNode('http://example.com', 0, 0);
21
- graph.addNode('http://example.com', 1, 200); // Should update status, but not depth?
22
- // Wait, addNode implementation:
23
- // if (!existing) { ... } else { if (status !== 0) existing.status = status; }
24
-
25
- const node = graph.nodes.get('http://example.com');
26
- expect(node?.status).toBe(200);
27
- expect(node?.depth).toBe(0); // Depth should not change
28
- });
29
-
30
- it('should add an edge', () => {
31
- graph.addNode('http://a.com', 0);
32
- graph.addNode('http://b.com', 1);
33
- graph.addEdge('http://a.com', 'http://b.com', 0.5);
34
-
35
- const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
36
- expect(graph.edges.has(edgeKey)).toBe(true);
37
- expect(graph.edges.get(edgeKey)).toBe(0.5);
38
-
39
- const source = graph.nodes.get('http://a.com');
40
- const target = graph.nodes.get('http://b.com');
41
- expect(source?.outLinks).toBe(1);
42
- expect(target?.inLinks).toBe(1);
43
- });
44
-
45
- it('should update edge weight if new weight is higher', () => {
46
- graph.addNode('http://a.com', 0);
47
- graph.addNode('http://b.com', 1);
48
- graph.addEdge('http://a.com', 'http://b.com', 0.5);
49
- graph.addEdge('http://a.com', 'http://b.com', 0.8);
50
-
51
- const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
52
- expect(graph.edges.get(edgeKey)).toBe(0.8);
53
-
54
- // Should not increment link counts again
55
- const source = graph.nodes.get('http://a.com');
56
- expect(source?.outLinks).toBe(1);
57
- });
58
-
59
- it('should not update edge weight if new weight is lower', () => {
60
- graph.addNode('http://a.com', 0);
61
- graph.addNode('http://b.com', 1);
62
- graph.addEdge('http://a.com', 'http://b.com', 0.8);
63
- graph.addEdge('http://a.com', 'http://b.com', 0.5);
64
-
65
- const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
66
- expect(graph.edges.get(edgeKey)).toBe(0.8);
67
- });
68
-
69
- it('should serialize to JSON and deserialize from JSON', () => {
70
- graph.addNode('http://a.com', 0, 200);
71
- graph.addNode('http://b.com', 1, 200);
72
- graph.addEdge('http://a.com', 'http://b.com', 1.0);
73
- graph.duplicateClusters = [{ id: '1', type: 'exact', size: 2, representative: 'http://a.com', severity: 'high' }];
74
- graph.contentClusters = [{ id: 1, count: 2, primaryUrl: 'http://a.com', risk: 'high' }];
75
-
76
- const json = graph.toJSON();
77
- const newGraph = Graph.fromJSON(json);
78
-
79
- expect(newGraph.nodes.size).toBe(2);
80
- expect(newGraph.edges.size).toBe(1);
81
- expect(newGraph.duplicateClusters).toHaveLength(1);
82
- expect(newGraph.contentClusters).toHaveLength(1);
83
-
84
- const nodeA = newGraph.nodes.get('http://a.com');
85
- expect(nodeA?.status).toBe(200);
86
-
87
- const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
88
- expect(newGraph.edges.get(edgeKey)).toBe(1.0);
89
- });
90
-
91
- it('should handle partial JSON in fromJSON', () => {
92
- const json = {
93
- nodes: [{ url: 'http://a.com', depth: 0, status: 200, inLinks: 0, outLinks: 0 }],
94
- // missing edges, clusters
95
- };
96
- const newGraph = Graph.fromJSON(json);
97
- expect(newGraph.nodes.size).toBe(1);
98
- expect(newGraph.edges.size).toBe(0);
99
- });
100
- });
@@ -1,124 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
- import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
3
- import { getDb, closeDb } from '../src/db/index.js';
4
- import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
- import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
- import { PageRepository } from '../src/db/repositories/PageRepository.js';
7
- import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
8
- import { Database } from 'better-sqlite3';
9
-
10
- describe('GraphLoader', () => {
11
- let db: Database;
12
-
13
- beforeEach(() => {
14
- process.env.NODE_ENV = 'test';
15
- closeDb();
16
- db = getDb();
17
- });
18
-
19
- afterEach(() => {
20
- closeDb();
21
- });
22
-
23
- it('should load graph with metrics correctly', () => {
24
- const siteRepo = new SiteRepository(db);
25
- const snapshotRepo = new SnapshotRepository(db);
26
- const pageRepo = new PageRepository(db);
27
- const metricsRepo = new MetricsRepository(db);
28
-
29
- const siteId = siteRepo.createSite('example.com');
30
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
31
- const url = 'http://example.com/page1';
32
-
33
- // Create Page
34
- pageRepo.upsertPage({
35
- site_id: siteId,
36
- normalized_url: url,
37
- last_seen_snapshot_id: snapshotId,
38
- http_status: 200,
39
- depth: 0
40
- });
41
- const page = pageRepo.getPage(siteId, url)!;
42
-
43
- // Insert Metrics
44
- metricsRepo.insertMetrics({
45
- snapshot_id: snapshotId,
46
- page_id: page.id,
47
- authority_score: 0.5,
48
- hub_score: 0.2,
49
- pagerank: 0.8,
50
- pagerank_score: 80.0,
51
- link_role: 'authority',
52
- crawl_status: 'fetched',
53
- word_count: 500,
54
- thin_content_score: 10,
55
- external_link_ratio: 0.1,
56
- orphan_score: 5,
57
- duplicate_cluster_id: null,
58
- duplicate_type: null,
59
- is_cluster_primary: 1
60
- });
61
-
62
- // Load Graph
63
- const graph = loadGraphFromSnapshot(snapshotId);
64
- const node = graph.nodes.get(url);
65
-
66
- expect(node).toBeDefined();
67
- expect(node?.authorityScore).toBe(0.5);
68
- expect(node?.hubScore).toBe(0.2);
69
- // Verify new fields
70
- expect(node?.crawlStatus).toBe('fetched');
71
- expect(node?.wordCount).toBe(500);
72
- expect(node?.thinContentScore).toBe(10);
73
- expect(node?.externalLinkRatio).toBe(0.1);
74
- expect(node?.orphanScore).toBe(5);
75
- });
76
-
77
- it('should handle null metrics gracefully', () => {
78
- const siteRepo = new SiteRepository(db);
79
- const snapshotRepo = new SnapshotRepository(db);
80
- const pageRepo = new PageRepository(db);
81
- const metricsRepo = new MetricsRepository(db);
82
-
83
- const siteId = siteRepo.createSite('example.com');
84
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
85
- const url = 'http://example.com/page2';
86
-
87
- pageRepo.upsertPage({
88
- site_id: siteId,
89
- normalized_url: url,
90
- last_seen_snapshot_id: snapshotId,
91
- http_status: 200,
92
- depth: 1
93
- });
94
- const page = pageRepo.getPage(siteId, url)!;
95
-
96
- // Insert Metrics with nulls
97
- metricsRepo.insertMetrics({
98
- snapshot_id: snapshotId,
99
- page_id: page.id,
100
- authority_score: null,
101
- hub_score: null,
102
- pagerank: null,
103
- pagerank_score: null,
104
- link_role: null,
105
- crawl_status: null,
106
- word_count: null,
107
- thin_content_score: null,
108
- external_link_ratio: null,
109
- orphan_score: null,
110
- duplicate_cluster_id: null,
111
- duplicate_type: null,
112
- is_cluster_primary: 0
113
- });
114
-
115
- const graph = loadGraphFromSnapshot(snapshotId);
116
- const node = graph.nodes.get(url);
117
-
118
- expect(node).toBeDefined();
119
- // Check undefined
120
- expect(node?.crawlStatus).toBeUndefined();
121
- expect(node?.wordCount).toBeUndefined();
122
- expect(node?.thinContentScore).toBeUndefined();
123
- });
124
- });
@@ -1,134 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Graph } from '../src/graph/graph.js';
3
- import { computeHITS } from '../src/scoring/hits.js';
4
-
5
- describe('HITS Scoring', () => {
6
- it('should compute scores for a simple star topology', () => {
7
- const graph = new Graph();
8
- // Hub
9
- graph.addNode('http://hub.com', 0, 200);
10
- // Authorities
11
- graph.addNode('http://auth1.com', 1, 200);
12
- graph.addNode('http://auth2.com', 1, 200);
13
- graph.addNode('http://auth3.com', 1, 200);
14
-
15
- graph.addEdge('http://hub.com', 'http://auth1.com');
16
- graph.addEdge('http://hub.com', 'http://auth2.com');
17
- graph.addEdge('http://hub.com', 'http://auth3.com');
18
-
19
- computeHITS(graph, { iterations: 10 });
20
-
21
- const hub = graph.nodes.get('http://hub.com')!;
22
- const auth1 = graph.nodes.get('http://auth1.com')!;
23
-
24
- // In a star topology:
25
- // Hub should have max hub score
26
- // Authorities should have max authority scores
27
- expect(hub.hubScore).toBeGreaterThan(0.9);
28
- expect(hub.authorityScore).toBe(0); // No one links to hub
29
-
30
- expect(auth1.authorityScore).toBeGreaterThan(0.5);
31
- expect(auth1.hubScore).toBe(0); // Auth1 links to no one
32
- });
33
-
34
- it('should handle exclusion rules', () => {
35
- const graph = new Graph();
36
- graph.addNode('http://valid.com', 0, 200);
37
- graph.addNode('http://noindex.com', 0, 200);
38
- graph.updateNodeData('http://noindex.com', { noindex: true });
39
- graph.addNode('http://redirect.com', 0, 200);
40
- graph.updateNodeData('http://redirect.com', { redirectChain: ['http://target.com'] });
41
- graph.addNode('http://external.com', 0, 200); // Eligibility check marks it as eligible if status is 200
42
- // but typically external wouldn't have status 200 in the graph if we don't crawl them or they are marked as external.
43
- // The current hits logic relies on: status === 200 && no redirectChain && !noindex
44
-
45
- graph.addEdge('http://valid.com', 'http://noindex.com');
46
- graph.addEdge('http://valid.com', 'http://redirect.com');
47
-
48
- computeHITS(graph);
49
-
50
- expect(graph.nodes.get('http://noindex.com')?.hubScore).toBeUndefined();
51
- expect(graph.nodes.get('http://redirect.com')?.hubScore).toBeUndefined();
52
- expect(graph.nodes.get('http://valid.com')?.hubScore).toBe(0); // Valid hub but its targets are ineligible
53
- });
54
-
55
- it('should respect edge weights', () => {
56
- const graph = new Graph();
57
- graph.addNode('http://hub.com', 0, 200);
58
- graph.addNode('http://auth-high.com', 1, 200);
59
- graph.addNode('http://auth-low.com', 1, 200);
60
-
61
- graph.addEdge('http://hub.com', 'http://auth-high.com', 1.0);
62
- graph.addEdge('http://hub.com', 'http://auth-low.com', 0.1);
63
-
64
- computeHITS(graph, { iterations: 10 });
65
-
66
- const authHigh = graph.nodes.get('http://auth-high.com')!;
67
- const authLow = graph.nodes.get('http://auth-low.com')!;
68
-
69
- expect(authHigh.authorityScore).toBeGreaterThan(authLow.authorityScore!);
70
- });
71
-
72
- it('should classify link roles correctly', () => {
73
- const graph = new Graph();
74
- for (let i = 0; i < 11; i++) {
75
- graph.addNode(`http://node${i}.com`, 0, 200);
76
- }
77
-
78
- // AUTHORITY: node1 (linked by 0,2,3... no outlinks)
79
- graph.addEdge('http://node0.com', 'http://node1.com');
80
- graph.addEdge('http://node2.com', 'http://node1.com');
81
- graph.addEdge('http://node3.com', 'http://node1.com');
82
- graph.addEdge('http://node4.com', 'http://node1.com');
83
-
84
- // HUB: node4 (links to 1,5,6,7... few inlinks)
85
- graph.addEdge('http://node4.com', 'http://node5.com');
86
- graph.addEdge('http://node4.com', 'http://node6.com');
87
- graph.addEdge('http://node4.com', 'http://node7.com');
88
-
89
- // POWER: node2 (linked by 0, power is often recursive... link to authority and be linked by hub)
90
- graph.addEdge('http://node0.com', 'http://node2.com');
91
- graph.addEdge('http://node2.com', 'http://node1.com');
92
- graph.addEdge('http://node2.com', 'http://node5.com');
93
-
94
- // PERIPHERAL: node10 (no links)
95
- // Some filler nodes to push medians down
96
- graph.addEdge('http://node8.com', 'http://node9.com');
97
-
98
- computeHITS(graph, { iterations: 20 });
99
-
100
- const roles = graph.getNodes().map(n => n.linkRole).filter(Boolean);
101
- expect(roles).toContain('authority');
102
- expect(roles).toContain('hub');
103
- expect(roles).toContain('power');
104
- expect(roles).toContain('peripheral');
105
- });
106
-
107
- it('should handle large synthetic graphs (Performance Test)', () => {
108
- const graph = new Graph();
109
- const nodeCount = 5000;
110
-
111
- // Create 5000 nodes
112
- for (let i = 0; i < nodeCount; i++) {
113
- graph.addNode(`http://page${i}.com`, 1, 200);
114
- }
115
-
116
- // Create random edges (avg 10 per node)
117
- for (let i = 0; i < nodeCount; i++) {
118
- for (let j = 0; j < 10; j++) {
119
- const target = Math.floor(Math.random() * nodeCount);
120
- if (i !== target) {
121
- graph.addEdge(`http://page${i}.com`, `http://page${target}.com`);
122
- }
123
- }
124
- }
125
-
126
- const start = Date.now();
127
- computeHITS(graph, { iterations: 20 });
128
- const duration = Date.now() - start;
129
-
130
- console.log(`HITS on 5000 nodes took ${duration}ms`);
131
- expect(duration).toBeLessThan(2000); // Should be very fast, but allow buffer for CI environments
132
- expect(graph.nodes.get('http://page0.com')?.hubScore).toBeDefined();
133
- });
134
- });
@@ -1,59 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { renderAnalysisHtml, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
3
-
4
- const mockPage: PageAnalysis = {
5
- url: 'https://example.com',
6
- status: 200,
7
- seoScore: 85,
8
- thinScore: 10,
9
- title: { value: 'Example Title', length: 13, status: 'ok' },
10
- metaDescription: { value: 'Example Desc', length: 12, status: 'ok' },
11
- h1: { count: 1, status: 'ok', matchesTitle: true },
12
- content: { wordCount: 500, uniqueSentenceCount: 50, textHtmlRatio: 0.6 },
13
- images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
- links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
15
- structuredData: { present: true, valid: true, types: ['Article'] },
16
- meta: { canonical: 'https://example.com', noindex: false, nofollow: false }
17
- };
18
-
19
- const mockResult: AnalysisResult = {
20
- site_summary: {
21
- pages_analyzed: 1,
22
- avg_seo_score: 85,
23
- thin_pages: 0,
24
- duplicate_titles: 0,
25
- site_score: 90
26
- },
27
- site_scores: { overallScore: 90, seoHealthScore: 85 },
28
- pages: [mockPage],
29
- active_modules: { seo: true, content: true, accessibility: true }
30
- };
31
-
32
- describe('HTML Report Generation', () => {
33
- test('generates single page report correctly', () => {
34
- // If pages length is 1, it renders single page report
35
- const html = renderAnalysisHtml(mockResult);
36
- expect(html).toContain('<!DOCTYPE html>');
37
- expect(html).toContain('Analysis for https://example.com');
38
- expect(html).toContain('Example Title');
39
- expect(html).toContain('Example Desc');
40
- expect(html).toContain('500 words');
41
- expect(html).toContain('<span class="status-ok">Valid</span>');
42
- });
43
-
44
- test('generates list report correctly', () => {
45
- // Modify result to have 2 pages to trigger list view
46
- const listResult: AnalysisResult = {
47
- ...mockResult,
48
- pages: [mockPage, { ...mockPage, url: 'https://example.com/2' }]
49
- };
50
- const html = renderAnalysisHtml(listResult);
51
-
52
- expect(html).toContain('<!DOCTYPE html>');
53
- expect(html).toContain('Crawlith Analysis Report');
54
- expect(html).toContain('Pages: 1'); // site_summary.pages_analyzed is 1 in mockResult
55
- expect(html).toContain('https://example.com');
56
- expect(html).toContain('https://example.com/2');
57
- expect(html).toContain('<td>85</td>'); // seoScore
58
- });
59
- });
@@ -1,73 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { IPGuard } from '../src/core/security/ipGuard.js';
3
- import * as dns from 'dns';
4
-
5
- vi.mock('dns', () => ({
6
- lookup: vi.fn(),
7
- resolve4: vi.fn(),
8
- resolve6: vi.fn(),
9
- }));
10
-
11
- describe('IPGuard Secure Lookup', () => {
12
- it('should resolve safe IPs', () => {
13
- const lookupMock = vi.mocked(dns.lookup);
14
- // Mock successful resolution
15
- lookupMock.mockImplementation((hostname, options, callback) => {
16
- callback(null, '8.8.8.8', 4);
17
- });
18
-
19
- const callback = vi.fn();
20
- IPGuard.secureLookup('google.com', {}, callback);
21
-
22
- expect(callback).toHaveBeenCalledWith(null, '8.8.8.8', 4);
23
- });
24
-
25
- it('should block internal IPs', () => {
26
- const lookupMock = vi.mocked(dns.lookup);
27
- // Mock internal IP resolution
28
- lookupMock.mockImplementation((hostname, options, callback) => {
29
- callback(null, '127.0.0.1', 4);
30
- });
31
-
32
- const callback = vi.fn();
33
- IPGuard.secureLookup('localhost', {}, callback);
34
-
35
- expect(callback).toHaveBeenCalledWith(expect.any(Error), '127.0.0.1', 4);
36
- const error = callback.mock.calls[0][0];
37
- expect(error.message).toContain('Blocked internal IP');
38
- expect(error.code).toBe('EBLOCKED');
39
- });
40
-
41
- it('should handle array of IPs (IPv4)', () => {
42
- const lookupMock = vi.mocked(dns.lookup);
43
- // Mock array resolution
44
- lookupMock.mockImplementation((hostname, options, callback) => {
45
- // Mocking address array structure
46
- const addresses = [
47
- { address: '1.1.1.1', family: 4 },
48
- { address: '127.0.0.1', family: 4 }
49
- ];
50
- callback(null, addresses as any, 4);
51
- });
52
-
53
- const callback = vi.fn();
54
- IPGuard.secureLookup('mixed.com', { all: true } as any, callback);
55
-
56
- expect(callback).toHaveBeenCalledWith(expect.any(Error), expect.anything(), 4);
57
- const error = callback.mock.calls[0][0];
58
- expect(error.message).toContain('Blocked internal IP');
59
- });
60
-
61
- it('should pass through DNS errors', () => {
62
- const lookupMock = vi.mocked(dns.lookup);
63
- const dnsError = new Error('ENOTFOUND');
64
- lookupMock.mockImplementation((hostname, options, callback) => {
65
- callback(dnsError as any, undefined as any, 0);
66
- });
67
-
68
- const callback = vi.fn();
69
- IPGuard.secureLookup('invalid.domain', {}, callback);
70
-
71
- expect(callback).toHaveBeenCalledWith(dnsError, undefined, 0);
72
- });
73
- });