@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,6 +1,7 @@
1
- import { test, expect, beforeEach } from 'vitest';
1
+ import { test, expect, beforeEach, vi } from 'vitest';
2
2
  import { Fetcher } from '../src/crawler/fetcher.js';
3
3
  import { MockAgent, setGlobalDispatcher } from 'undici';
4
+ import { IPGuard } from '../src/core/security/ipGuard.js';
4
5
 
5
6
  let mockAgent: MockAgent;
6
7
 
@@ -8,6 +9,9 @@ beforeEach(() => {
8
9
  mockAgent = new MockAgent();
9
10
  mockAgent.disableNetConnect();
10
11
  setGlobalDispatcher(mockAgent);
12
+
13
+ // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
14
+ vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
11
15
  });
12
16
 
13
17
  test('fetches simple page', async () => {
@@ -2,9 +2,15 @@ import { describe, it, expect, vi, beforeEach } from 'vitest';
2
2
  import { Fetcher } from '../src/crawler/fetcher.js';
3
3
  import { request } from 'undici';
4
4
 
5
- vi.mock('undici', () => ({
6
- request: vi.fn(),
7
- }));
5
+ vi.mock('undici', () => {
6
+ return {
7
+ request: vi.fn(),
8
+ Agent: class {
9
+ dispatch = vi.fn();
10
+ },
11
+ Dispatcher: class {}
12
+ };
13
+ });
8
14
 
9
15
  describe('Fetcher Safety Integration', () => {
10
16
  let fetcher: Fetcher;
@@ -0,0 +1,100 @@
1
+ import { describe, it, expect, beforeEach } from 'vitest';
2
+ import { Graph } from '../../src/graph/graph.js';
3
+
4
+ describe('Graph', () => {
5
+ let graph: Graph;
6
+
7
+ beforeEach(() => {
8
+ graph = new Graph();
9
+ });
10
+
11
+ it('should add a new node', () => {
12
+ graph.addNode('http://example.com', 0, 200);
13
+ const node = graph.nodes.get('http://example.com');
14
+ expect(node).toBeDefined();
15
+ expect(node?.depth).toBe(0);
16
+ expect(node?.status).toBe(200);
17
+ });
18
+
19
+ it('should update existing node status if non-zero', () => {
20
+ graph.addNode('http://example.com', 0, 0);
21
+ graph.addNode('http://example.com', 1, 200); // Should update status, but not depth?
22
+ // Wait, addNode implementation:
23
+ // if (!existing) { ... } else { if (status !== 0) existing.status = status; }
24
+
25
+ const node = graph.nodes.get('http://example.com');
26
+ expect(node?.status).toBe(200);
27
+ expect(node?.depth).toBe(0); // Depth should not change
28
+ });
29
+
30
+ it('should add an edge', () => {
31
+ graph.addNode('http://a.com', 0);
32
+ graph.addNode('http://b.com', 1);
33
+ graph.addEdge('http://a.com', 'http://b.com', 0.5);
34
+
35
+ const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
36
+ expect(graph.edges.has(edgeKey)).toBe(true);
37
+ expect(graph.edges.get(edgeKey)).toBe(0.5);
38
+
39
+ const source = graph.nodes.get('http://a.com');
40
+ const target = graph.nodes.get('http://b.com');
41
+ expect(source?.outLinks).toBe(1);
42
+ expect(target?.inLinks).toBe(1);
43
+ });
44
+
45
+ it('should update edge weight if new weight is higher', () => {
46
+ graph.addNode('http://a.com', 0);
47
+ graph.addNode('http://b.com', 1);
48
+ graph.addEdge('http://a.com', 'http://b.com', 0.5);
49
+ graph.addEdge('http://a.com', 'http://b.com', 0.8);
50
+
51
+ const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
52
+ expect(graph.edges.get(edgeKey)).toBe(0.8);
53
+
54
+ // Should not increment link counts again
55
+ const source = graph.nodes.get('http://a.com');
56
+ expect(source?.outLinks).toBe(1);
57
+ });
58
+
59
+ it('should not update edge weight if new weight is lower', () => {
60
+ graph.addNode('http://a.com', 0);
61
+ graph.addNode('http://b.com', 1);
62
+ graph.addEdge('http://a.com', 'http://b.com', 0.8);
63
+ graph.addEdge('http://a.com', 'http://b.com', 0.5);
64
+
65
+ const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
66
+ expect(graph.edges.get(edgeKey)).toBe(0.8);
67
+ });
68
+
69
+ it('should serialize to JSON and deserialize from JSON', () => {
70
+ graph.addNode('http://a.com', 0, 200);
71
+ graph.addNode('http://b.com', 1, 200);
72
+ graph.addEdge('http://a.com', 'http://b.com', 1.0);
73
+ graph.duplicateClusters = [{ id: '1', type: 'exact', size: 2, representative: 'http://a.com', severity: 'high' }];
74
+ graph.contentClusters = [{ id: 1, count: 2, primaryUrl: 'http://a.com', risk: 'high' }];
75
+
76
+ const json = graph.toJSON();
77
+ const newGraph = Graph.fromJSON(json);
78
+
79
+ expect(newGraph.nodes.size).toBe(2);
80
+ expect(newGraph.edges.size).toBe(1);
81
+ expect(newGraph.duplicateClusters).toHaveLength(1);
82
+ expect(newGraph.contentClusters).toHaveLength(1);
83
+
84
+ const nodeA = newGraph.nodes.get('http://a.com');
85
+ expect(nodeA?.status).toBe(200);
86
+
87
+ const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
88
+ expect(newGraph.edges.get(edgeKey)).toBe(1.0);
89
+ });
90
+
91
+ it('should handle partial JSON in fromJSON', () => {
92
+ const json = {
93
+ nodes: [{ url: 'http://a.com', depth: 0, status: 200, inLinks: 0, outLinks: 0 }],
94
+ // missing edges, clusters
95
+ };
96
+ const newGraph = Graph.fromJSON(json);
97
+ expect(newGraph.nodes.size).toBe(1);
98
+ expect(newGraph.edges.size).toBe(0);
99
+ });
100
+ });
@@ -0,0 +1,124 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
3
+ import { getDb, closeDb } from '../src/db/index.js';
4
+ import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
5
+ import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
6
+ import { PageRepository } from '../src/db/repositories/PageRepository.js';
7
+ import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
8
+ import { Database } from 'better-sqlite3';
9
+
10
+ describe('GraphLoader', () => {
11
+ let db: Database;
12
+
13
+ beforeEach(() => {
14
+ process.env.NODE_ENV = 'test';
15
+ closeDb();
16
+ db = getDb();
17
+ });
18
+
19
+ afterEach(() => {
20
+ closeDb();
21
+ });
22
+
23
+ it('should load graph with metrics correctly', () => {
24
+ const siteRepo = new SiteRepository(db);
25
+ const snapshotRepo = new SnapshotRepository(db);
26
+ const pageRepo = new PageRepository(db);
27
+ const metricsRepo = new MetricsRepository(db);
28
+
29
+ const siteId = siteRepo.createSite('example.com');
30
+ const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
31
+ const url = 'http://example.com/page1';
32
+
33
+ // Create Page
34
+ pageRepo.upsertPage({
35
+ site_id: siteId,
36
+ normalized_url: url,
37
+ last_seen_snapshot_id: snapshotId,
38
+ http_status: 200,
39
+ depth: 0
40
+ });
41
+ const page = pageRepo.getPage(siteId, url)!;
42
+
43
+ // Insert Metrics
44
+ metricsRepo.insertMetrics({
45
+ snapshot_id: snapshotId,
46
+ page_id: page.id,
47
+ authority_score: 0.5,
48
+ hub_score: 0.2,
49
+ pagerank: 0.8,
50
+ pagerank_score: 80.0,
51
+ link_role: 'authority',
52
+ crawl_status: 'fetched',
53
+ word_count: 500,
54
+ thin_content_score: 10,
55
+ external_link_ratio: 0.1,
56
+ orphan_score: 5,
57
+ duplicate_cluster_id: null,
58
+ duplicate_type: null,
59
+ is_cluster_primary: 1
60
+ });
61
+
62
+ // Load Graph
63
+ const graph = loadGraphFromSnapshot(snapshotId);
64
+ const node = graph.nodes.get(url);
65
+
66
+ expect(node).toBeDefined();
67
+ expect(node?.authorityScore).toBe(0.5);
68
+ expect(node?.hubScore).toBe(0.2);
69
+ // Verify new fields
70
+ expect(node?.crawlStatus).toBe('fetched');
71
+ expect(node?.wordCount).toBe(500);
72
+ expect(node?.thinContentScore).toBe(10);
73
+ expect(node?.externalLinkRatio).toBe(0.1);
74
+ expect(node?.orphanScore).toBe(5);
75
+ });
76
+
77
+ it('should handle null metrics gracefully', () => {
78
+ const siteRepo = new SiteRepository(db);
79
+ const snapshotRepo = new SnapshotRepository(db);
80
+ const pageRepo = new PageRepository(db);
81
+ const metricsRepo = new MetricsRepository(db);
82
+
83
+ const siteId = siteRepo.createSite('example.com');
84
+ const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
85
+ const url = 'http://example.com/page2';
86
+
87
+ pageRepo.upsertPage({
88
+ site_id: siteId,
89
+ normalized_url: url,
90
+ last_seen_snapshot_id: snapshotId,
91
+ http_status: 200,
92
+ depth: 1
93
+ });
94
+ const page = pageRepo.getPage(siteId, url)!;
95
+
96
+ // Insert Metrics with nulls
97
+ metricsRepo.insertMetrics({
98
+ snapshot_id: snapshotId,
99
+ page_id: page.id,
100
+ authority_score: null,
101
+ hub_score: null,
102
+ pagerank: null,
103
+ pagerank_score: null,
104
+ link_role: null,
105
+ crawl_status: null,
106
+ word_count: null,
107
+ thin_content_score: null,
108
+ external_link_ratio: null,
109
+ orphan_score: null,
110
+ duplicate_cluster_id: null,
111
+ duplicate_type: null,
112
+ is_cluster_primary: 0
113
+ });
114
+
115
+ const graph = loadGraphFromSnapshot(snapshotId);
116
+ const node = graph.nodes.get(url);
117
+
118
+ expect(node).toBeDefined();
119
+ // Check undefined
120
+ expect(node?.crawlStatus).toBeUndefined();
121
+ expect(node?.wordCount).toBeUndefined();
122
+ expect(node?.thinContentScore).toBeUndefined();
123
+ });
124
+ });
@@ -1,58 +1,59 @@
1
1
  import { describe, expect, test } from 'vitest';
2
- import { generateHtml } from '../src/report/html.js';
3
- import { Metrics } from '../src/graph/metrics.js';
2
+ import { renderAnalysisHtml, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
4
3
 
5
- describe('html report generator', () => {
6
- test('generates valid html string with metrics', () => {
7
- const mockMetrics: Metrics = {
8
- totalPages: 10,
9
- totalEdges: 20,
10
- orphanPages: ['https://example.com/orphan'],
11
- nearOrphans: [],
12
- deepPages: [],
13
- topAuthorityPages: [{ url: 'https://example.com/', authority: 0.9 }],
14
- averageOutDegree: 2.0,
15
- maxDepthFound: 5,
16
- crawlEfficiencyScore: 0.8,
17
- averageDepth: 3.0,
18
- structuralEntropy: 1.5,
19
- topPageRankPages: [],
20
- limitReached: false,
21
- sessionStats: {
22
- pagesFetched: 5,
23
- pagesCached: 2,
24
- pagesSkipped: 0,
25
- totalFound: 7
26
- }
27
- };
4
+ const mockPage: PageAnalysis = {
5
+ url: 'https://example.com',
6
+ status: 200,
7
+ seoScore: 85,
8
+ thinScore: 10,
9
+ title: { value: 'Example Title', length: 13, status: 'ok' },
10
+ metaDescription: { value: 'Example Desc', length: 12, status: 'ok' },
11
+ h1: { count: 1, status: 'ok', matchesTitle: true },
12
+ content: { wordCount: 500, uniqueSentenceCount: 50, textHtmlRatio: 0.6 },
13
+ images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
+ links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
15
+ structuredData: { present: true, valid: true, types: ['Article'] },
16
+ meta: { canonical: 'https://example.com', noindex: false, nofollow: false }
17
+ };
28
18
 
29
- const mockGraphData = {
30
- nodes: [{ url: 'https://example.com/', depth: 0, inLinks: 5, outLinks: 2, status: 200 }],
31
- edges: []
32
- };
19
+ const mockResult: AnalysisResult = {
20
+ site_summary: {
21
+ pages_analyzed: 1,
22
+ avg_seo_score: 85,
23
+ thin_pages: 0,
24
+ duplicate_titles: 0,
25
+ site_score: 90
26
+ },
27
+ site_scores: { overallScore: 90, seoHealthScore: 85 },
28
+ pages: [mockPage],
29
+ active_modules: { seo: true, content: true, accessibility: true }
30
+ };
33
31
 
34
- const html = generateHtml(mockGraphData, mockMetrics);
32
+ describe('HTML Report Generation', () => {
33
+ test('generates single page report correctly', () => {
34
+ // If pages length is 1, it renders single page report
35
+ const html = renderAnalysisHtml(mockResult);
36
+ expect(html).toContain('<!DOCTYPE html>');
37
+ expect(html).toContain('Analysis for https://example.com');
38
+ expect(html).toContain('Example Title');
39
+ expect(html).toContain('Example Desc');
40
+ expect(html).toContain('500 words');
41
+ expect(html).toContain('<span class="status-ok">Valid</span>');
42
+ });
35
43
 
36
- expect(html).toContain('<!DOCTYPE html>');
37
- expect(html).toContain('Crawlith Site Graph');
38
- expect(html).toContain('10</span>'); // totalPages
39
- expect(html).toContain('5 pages</span>'); // pagesFetched
40
- expect(html).toContain('2</span>'); // pagesCached
41
- expect(html).toContain('https://example.com/orphan');
42
- expect(html).toContain('window.GRAPH_DATA =');
43
- });
44
+ test('generates list report correctly', () => {
45
+ // Modify result to have 2 pages to trigger list view
46
+ const listResult: AnalysisResult = {
47
+ ...mockResult,
48
+ pages: [mockPage, { ...mockPage, url: 'https://example.com/2' }]
49
+ };
50
+ const html = renderAnalysisHtml(listResult);
44
51
 
45
- test('handles missing session stats', () => {
46
- const mockMetrics: any = {
47
- totalPages: 10,
48
- totalEdges: 20,
49
- orphanPages: [],
50
- averageOutDegree: 2.0,
51
- maxDepthFound: 5,
52
- topAuthorityPages: [],
53
- sessionStats: null
54
- };
55
- const html = generateHtml({ nodes: [], edges: [] }, mockMetrics as any);
56
- expect(html).not.toContain('Session Crawl:');
57
- });
52
+ expect(html).toContain('<!DOCTYPE html>');
53
+ expect(html).toContain('Crawlith Analysis Report');
54
+ expect(html).toContain('Pages: 1'); // site_summary.pages_analyzed is 1 in mockResult
55
+ expect(html).toContain('https://example.com');
56
+ expect(html).toContain('https://example.com/2');
57
+ expect(html).toContain('<td>85</td>'); // seoScore
58
+ });
58
59
  });
@@ -0,0 +1,73 @@
1
+ import { describe, it, expect, vi } from 'vitest';
2
+ import { IPGuard } from '../src/core/security/ipGuard.js';
3
+ import * as dns from 'dns';
4
+
5
+ vi.mock('dns', () => ({
6
+ lookup: vi.fn(),
7
+ resolve4: vi.fn(),
8
+ resolve6: vi.fn(),
9
+ }));
10
+
11
+ describe('IPGuard Secure Lookup', () => {
12
+ it('should resolve safe IPs', () => {
13
+ const lookupMock = vi.mocked(dns.lookup);
14
+ // Mock successful resolution
15
+ lookupMock.mockImplementation((hostname, options, callback) => {
16
+ callback(null, '8.8.8.8', 4);
17
+ });
18
+
19
+ const callback = vi.fn();
20
+ IPGuard.secureLookup('google.com', {}, callback);
21
+
22
+ expect(callback).toHaveBeenCalledWith(null, '8.8.8.8', 4);
23
+ });
24
+
25
+ it('should block internal IPs', () => {
26
+ const lookupMock = vi.mocked(dns.lookup);
27
+ // Mock internal IP resolution
28
+ lookupMock.mockImplementation((hostname, options, callback) => {
29
+ callback(null, '127.0.0.1', 4);
30
+ });
31
+
32
+ const callback = vi.fn();
33
+ IPGuard.secureLookup('localhost', {}, callback);
34
+
35
+ expect(callback).toHaveBeenCalledWith(expect.any(Error), '127.0.0.1', 4);
36
+ const error = callback.mock.calls[0][0];
37
+ expect(error.message).toContain('Blocked internal IP');
38
+ expect(error.code).toBe('EBLOCKED');
39
+ });
40
+
41
+ it('should handle array of IPs (IPv4)', () => {
42
+ const lookupMock = vi.mocked(dns.lookup);
43
+ // Mock array resolution
44
+ lookupMock.mockImplementation((hostname, options, callback) => {
45
+ // Mocking address array structure
46
+ const addresses = [
47
+ { address: '1.1.1.1', family: 4 },
48
+ { address: '127.0.0.1', family: 4 }
49
+ ];
50
+ callback(null, addresses as any, 4);
51
+ });
52
+
53
+ const callback = vi.fn();
54
+ IPGuard.secureLookup('mixed.com', { all: true } as any, callback);
55
+
56
+ expect(callback).toHaveBeenCalledWith(expect.any(Error), expect.anything(), 4);
57
+ const error = callback.mock.calls[0][0];
58
+ expect(error.message).toContain('Blocked internal IP');
59
+ });
60
+
61
+ it('should pass through DNS errors', () => {
62
+ const lookupMock = vi.mocked(dns.lookup);
63
+ const dnsError = new Error('ENOTFOUND');
64
+ lookupMock.mockImplementation((hostname, options, callback) => {
65
+ callback(dnsError as any, undefined as any, 0);
66
+ });
67
+
68
+ const callback = vi.fn();
69
+ IPGuard.secureLookup('invalid.domain', {}, callback);
70
+
71
+ expect(callback).toHaveBeenCalledWith(dnsError, undefined, 0);
72
+ });
73
+ });
@@ -6,6 +6,7 @@ import { existsSync, unlinkSync, readFileSync } from 'node:fs';
6
6
  import path from 'node:path';
7
7
  import os from 'node:os';
8
8
  import { isPidAlive } from '../../src/lock/pidCheck.js';
9
+ import { EngineContext } from '../../src/events.js';
9
10
 
10
11
  // Mock fs and os
11
12
  vi.mock('node:fs/promises');
@@ -15,6 +16,8 @@ vi.mock('../../src/lock/pidCheck.js', () => ({
15
16
  isPidAlive: vi.fn()
16
17
  }));
17
18
 
19
+ const mockContext: EngineContext = { emit: vi.fn() };
20
+
18
21
  describe('LockManager', () => {
19
22
  const mockHomeDir = '/home/user';
20
23
  const lockDir = path.join(mockHomeDir, '.crawlith', 'locks');
@@ -40,23 +43,18 @@ describe('LockManager', () => {
40
43
  vi.spyOn(process, 'exit').mockImplementation((code) => {
41
44
  throw new Error(`Process exit ${code}`);
42
45
  });
43
-
44
- // Mock console to suppress noise
45
- vi.spyOn(console, 'log').mockImplementation(() => {});
46
- vi.spyOn(console, 'warn').mockImplementation(() => {});
47
- vi.spyOn(console, 'error').mockImplementation(() => {});
48
-
49
46
  // Reset static state if any (LockManager stores lockFilePath)
50
- // We can't easily reset private static via TS, but we can call releaseLock which clears it if set
51
47
  LockManager.releaseLock();
52
48
  });
53
49
 
54
50
  afterEach(() => {
55
51
  vi.restoreAllMocks();
52
+ // Reset static state
53
+ LockManager.releaseLock();
56
54
  });
57
55
 
58
56
  it('should acquire lock when no lock exists', async () => {
59
- await LockManager.acquireLock(command, target, options);
57
+ await LockManager.acquireLock(command, target, options, mockContext);
60
58
 
61
59
  expect(fs.mkdir).toHaveBeenCalledWith(lockDir, { recursive: true });
62
60
  expect(fs.writeFile).toHaveBeenCalledWith(
@@ -77,9 +75,9 @@ describe('LockManager', () => {
77
75
  }));
78
76
  vi.mocked(isPidAlive).mockReturnValue(true);
79
77
 
80
- await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');
78
+ await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
81
79
 
82
- expect(console.error).toHaveBeenCalledWith(expect.stringContaining('already running'));
80
+ expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('already running') }));
83
81
  });
84
82
 
85
83
  it('should clear stale lock and acquire if PID is dead', async () => {
@@ -93,11 +91,11 @@ describe('LockManager', () => {
93
91
  }));
94
92
  vi.mocked(isPidAlive).mockReturnValue(false);
95
93
 
96
- await LockManager.acquireLock(command, target, options);
94
+ await LockManager.acquireLock(command, target, options, mockContext);
97
95
 
98
96
  expect(unlinkSync).toHaveBeenCalledWith(lockPath);
99
97
  expect(fs.writeFile).toHaveBeenCalled();
100
- expect(console.log).toHaveBeenCalledWith(expect.stringContaining('Detected stale lock'));
98
+ expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'info', message: expect.stringContaining('Detected stale lock') }));
101
99
  });
102
100
 
103
101
  it('should override lock if force is true', async () => {
@@ -108,24 +106,24 @@ describe('LockManager', () => {
108
106
  }));
109
107
  vi.mocked(isPidAlive).mockReturnValue(true);
110
108
 
111
- await LockManager.acquireLock(command, target, options, true); // force = true
109
+ await LockManager.acquireLock(command, target, options, mockContext, true); // force = true
112
110
 
113
111
  expect(unlinkSync).toHaveBeenCalledWith(lockPath);
114
112
  expect(fs.writeFile).toHaveBeenCalled();
115
- expect(console.warn).toHaveBeenCalledWith(expect.stringContaining('Force mode enabled'));
113
+ expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn', message: expect.stringContaining('Force mode enabled') }));
116
114
  });
117
115
 
118
116
  it('should handle race condition (EEXIST)', async () => {
119
117
  vi.mocked(existsSync).mockReturnValue(false);
120
118
  vi.mocked(fs.writeFile).mockRejectedValue({ code: 'EEXIST' });
121
119
 
122
- await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');
123
- expect(console.error).toHaveBeenCalledWith(expect.stringContaining('Race condition'));
120
+ await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
121
+ expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Race condition') }));
124
122
  });
125
123
 
126
124
  it('should release lock on exit', async () => {
127
125
  // Acquire first (existsSync returns false by default from beforeEach)
128
- await LockManager.acquireLock(command, target, options);
126
+ await LockManager.acquireLock(command, target, options, mockContext);
129
127
 
130
128
  // Simulate file exists for release
131
129
  vi.mocked(existsSync).mockReturnValue(true);
@@ -135,4 +133,66 @@ describe('LockManager', () => {
135
133
 
136
134
  expect(unlinkSync).toHaveBeenCalledWith(lockPath);
137
135
  });
136
+
137
+ it('should register signal handlers and cleanup on SIGINT', async () => {
138
+ const processOnSpy = vi.spyOn(process, 'on');
139
+ await LockManager.acquireLock(command, target, options, mockContext);
140
+
141
+ // Find the handler
142
+ const sigintCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGINT');
143
+ expect(sigintCall).toBeDefined();
144
+ const handler = sigintCall![1] as () => void;
145
+
146
+ // Trigger handler
147
+ vi.mocked(existsSync).mockReturnValue(true); // Simulate file still exists
148
+ try {
149
+ handler();
150
+ } catch (e: any) {
151
+ // Expect process.exit(130) which throws error in our mock
152
+ expect(e.message).toBe('Process exit 130');
153
+ }
154
+
155
+ expect(unlinkSync).toHaveBeenCalledWith(lockPath);
156
+ });
157
+
158
+ it('should register signal handlers and cleanup on SIGTERM', async () => {
159
+ const processOnSpy = vi.spyOn(process, 'on');
160
+ await LockManager.acquireLock(command, target, options, mockContext);
161
+
162
+ // Find the handler
163
+ const sigtermCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGTERM');
164
+ expect(sigtermCall).toBeDefined();
165
+ const handler = sigtermCall![1] as () => void;
166
+
167
+ // Trigger handler
168
+ vi.mocked(existsSync).mockReturnValue(true);
169
+ try {
170
+ handler();
171
+ } catch (e: any) {
172
+ expect(e.message).toBe('Process exit 143');
173
+ }
174
+
175
+ expect(unlinkSync).toHaveBeenCalledWith(lockPath);
176
+ });
177
+
178
+ it('should register signal handlers and cleanup on uncaughtException', async () => {
179
+ const processOnSpy = vi.spyOn(process, 'on');
180
+ await LockManager.acquireLock(command, target, options, mockContext);
181
+
182
+ // Find the handler
183
+ const uncaughtExceptionCall = processOnSpy.mock.calls.find(call => call[0] === 'uncaughtException');
184
+ expect(uncaughtExceptionCall).toBeDefined();
185
+ const handler = uncaughtExceptionCall![1] as (err: Error) => void;
186
+
187
+ // Trigger handler
188
+ vi.mocked(existsSync).mockReturnValue(true);
189
+ try {
190
+ handler(new Error('Test error'));
191
+ } catch (e: any) {
192
+ expect(e.message).toBe('Process exit 1');
193
+ }
194
+
195
+ expect(unlinkSync).toHaveBeenCalledWith(lockPath);
196
+ expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Uncaught Exception'), error: expect.any(Error) }));
197
+ });
138
198
  });
@@ -1,5 +1,4 @@
1
1
  import { normalizeUrl } from '../src/crawler/normalize.js';
2
- import { extractLinks } from '../src/crawler/extract.js';
3
2
  import { test, expect } from 'vitest';
4
3
 
5
4
  test('normalizeUrl', () => {
@@ -11,24 +10,6 @@ test('normalizeUrl', () => {
11
10
  expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
12
11
  });
13
12
 
14
- test('extractLinks', () => {
15
- const html = `
16
- <html>
17
- <body>
18
- <a href="/foo">Foo</a>
19
- <a href="bar">Bar</a>
20
- <a href="https://other.com/baz">Baz</a>
21
- <a href="#top">Top</a>
22
- </body>
23
- </html>
24
- `;
25
- const links = extractLinks(html, 'https://example.com/page/');
26
- expect(links).toContain('https://example.com/foo');
27
- expect(links).toContain('https://example.com/page/bar');
28
- expect(links).toContain('https://other.com/baz');
29
- expect(links).not.toContain('https://example.com/page/#top');
30
- expect(links).toContain('https://example.com/page/'); // #top resolves to base url without fragment
31
- });
32
13
  test('normalizeUrl: absolute resolution', () => {
33
14
  expect(normalizeUrl('/foo', 'https://example.com')).toBe('https://example.com/foo');
34
15
  expect(normalizeUrl('bar', 'https://example.com/baz/')).toBe('https://example.com/baz/bar');
@@ -95,6 +76,12 @@ test('normalizeUrl: skip non-HTML assets', () => {
95
76
  expect(normalizeUrl('https://example.com/page', '')).toBe('https://example.com/page');
96
77
  });
97
78
 
79
+ test('normalizeUrl: invalid URL', () => {
80
+ expect(normalizeUrl('/foo', '')).toBeNull();
81
+ expect(normalizeUrl('invalid-url', '')).toBeNull();
82
+ expect(normalizeUrl('/foo', 'invalid-base')).toBeNull();
83
+ });
84
+
98
85
  test('normalizeUrl: return format', () => {
99
86
  const res = normalizeUrl('https://example.com/foo?a=1', '');
100
87
  expect(res).toBe('https://example.com/foo?a=1');