@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/tests/fetcher.test.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { test, expect, beforeEach } from 'vitest';
|
|
1
|
+
import { test, expect, beforeEach, vi } from 'vitest';
|
|
2
2
|
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
3
|
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
4
|
+
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
4
5
|
|
|
5
6
|
let mockAgent: MockAgent;
|
|
6
7
|
|
|
@@ -8,6 +9,9 @@ beforeEach(() => {
|
|
|
8
9
|
mockAgent = new MockAgent();
|
|
9
10
|
mockAgent.disableNetConnect();
|
|
10
11
|
setGlobalDispatcher(mockAgent);
|
|
12
|
+
|
|
13
|
+
// IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
|
|
14
|
+
vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
|
|
11
15
|
});
|
|
12
16
|
|
|
13
17
|
test('fetches simple page', async () => {
|
|
@@ -2,9 +2,15 @@ import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
|
2
2
|
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
3
|
import { request } from 'undici';
|
|
4
4
|
|
|
5
|
-
vi.mock('undici', () =>
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
vi.mock('undici', () => {
|
|
6
|
+
return {
|
|
7
|
+
request: vi.fn(),
|
|
8
|
+
Agent: class {
|
|
9
|
+
dispatch = vi.fn();
|
|
10
|
+
},
|
|
11
|
+
Dispatcher: class {}
|
|
12
|
+
};
|
|
13
|
+
});
|
|
8
14
|
|
|
9
15
|
describe('Fetcher Safety Integration', () => {
|
|
10
16
|
let fetcher: Fetcher;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
+
import { Graph } from '../../src/graph/graph.js';
|
|
3
|
+
|
|
4
|
+
describe('Graph', () => {
|
|
5
|
+
let graph: Graph;
|
|
6
|
+
|
|
7
|
+
beforeEach(() => {
|
|
8
|
+
graph = new Graph();
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
it('should add a new node', () => {
|
|
12
|
+
graph.addNode('http://example.com', 0, 200);
|
|
13
|
+
const node = graph.nodes.get('http://example.com');
|
|
14
|
+
expect(node).toBeDefined();
|
|
15
|
+
expect(node?.depth).toBe(0);
|
|
16
|
+
expect(node?.status).toBe(200);
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
it('should update existing node status if non-zero', () => {
|
|
20
|
+
graph.addNode('http://example.com', 0, 0);
|
|
21
|
+
graph.addNode('http://example.com', 1, 200); // Should update status, but not depth?
|
|
22
|
+
// Wait, addNode implementation:
|
|
23
|
+
// if (!existing) { ... } else { if (status !== 0) existing.status = status; }
|
|
24
|
+
|
|
25
|
+
const node = graph.nodes.get('http://example.com');
|
|
26
|
+
expect(node?.status).toBe(200);
|
|
27
|
+
expect(node?.depth).toBe(0); // Depth should not change
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it('should add an edge', () => {
|
|
31
|
+
graph.addNode('http://a.com', 0);
|
|
32
|
+
graph.addNode('http://b.com', 1);
|
|
33
|
+
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
34
|
+
|
|
35
|
+
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
36
|
+
expect(graph.edges.has(edgeKey)).toBe(true);
|
|
37
|
+
expect(graph.edges.get(edgeKey)).toBe(0.5);
|
|
38
|
+
|
|
39
|
+
const source = graph.nodes.get('http://a.com');
|
|
40
|
+
const target = graph.nodes.get('http://b.com');
|
|
41
|
+
expect(source?.outLinks).toBe(1);
|
|
42
|
+
expect(target?.inLinks).toBe(1);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it('should update edge weight if new weight is higher', () => {
|
|
46
|
+
graph.addNode('http://a.com', 0);
|
|
47
|
+
graph.addNode('http://b.com', 1);
|
|
48
|
+
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
49
|
+
graph.addEdge('http://a.com', 'http://b.com', 0.8);
|
|
50
|
+
|
|
51
|
+
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
52
|
+
expect(graph.edges.get(edgeKey)).toBe(0.8);
|
|
53
|
+
|
|
54
|
+
// Should not increment link counts again
|
|
55
|
+
const source = graph.nodes.get('http://a.com');
|
|
56
|
+
expect(source?.outLinks).toBe(1);
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
it('should not update edge weight if new weight is lower', () => {
|
|
60
|
+
graph.addNode('http://a.com', 0);
|
|
61
|
+
graph.addNode('http://b.com', 1);
|
|
62
|
+
graph.addEdge('http://a.com', 'http://b.com', 0.8);
|
|
63
|
+
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
64
|
+
|
|
65
|
+
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
66
|
+
expect(graph.edges.get(edgeKey)).toBe(0.8);
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
it('should serialize to JSON and deserialize from JSON', () => {
|
|
70
|
+
graph.addNode('http://a.com', 0, 200);
|
|
71
|
+
graph.addNode('http://b.com', 1, 200);
|
|
72
|
+
graph.addEdge('http://a.com', 'http://b.com', 1.0);
|
|
73
|
+
graph.duplicateClusters = [{ id: '1', type: 'exact', size: 2, representative: 'http://a.com', severity: 'high' }];
|
|
74
|
+
graph.contentClusters = [{ id: 1, count: 2, primaryUrl: 'http://a.com', risk: 'high' }];
|
|
75
|
+
|
|
76
|
+
const json = graph.toJSON();
|
|
77
|
+
const newGraph = Graph.fromJSON(json);
|
|
78
|
+
|
|
79
|
+
expect(newGraph.nodes.size).toBe(2);
|
|
80
|
+
expect(newGraph.edges.size).toBe(1);
|
|
81
|
+
expect(newGraph.duplicateClusters).toHaveLength(1);
|
|
82
|
+
expect(newGraph.contentClusters).toHaveLength(1);
|
|
83
|
+
|
|
84
|
+
const nodeA = newGraph.nodes.get('http://a.com');
|
|
85
|
+
expect(nodeA?.status).toBe(200);
|
|
86
|
+
|
|
87
|
+
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
88
|
+
expect(newGraph.edges.get(edgeKey)).toBe(1.0);
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('should handle partial JSON in fromJSON', () => {
|
|
92
|
+
const json = {
|
|
93
|
+
nodes: [{ url: 'http://a.com', depth: 0, status: 200, inLinks: 0, outLinks: 0 }],
|
|
94
|
+
// missing edges, clusters
|
|
95
|
+
};
|
|
96
|
+
const newGraph = Graph.fromJSON(json);
|
|
97
|
+
expect(newGraph.nodes.size).toBe(1);
|
|
98
|
+
expect(newGraph.edges.size).toBe(0);
|
|
99
|
+
});
|
|
100
|
+
});
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
|
|
3
|
+
import { getDb, closeDb } from '../src/db/index.js';
|
|
4
|
+
import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
|
|
5
|
+
import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
|
|
6
|
+
import { PageRepository } from '../src/db/repositories/PageRepository.js';
|
|
7
|
+
import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
|
|
8
|
+
import { Database } from 'better-sqlite3';
|
|
9
|
+
|
|
10
|
+
describe('GraphLoader', () => {
|
|
11
|
+
let db: Database;
|
|
12
|
+
|
|
13
|
+
beforeEach(() => {
|
|
14
|
+
process.env.NODE_ENV = 'test';
|
|
15
|
+
closeDb();
|
|
16
|
+
db = getDb();
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
afterEach(() => {
|
|
20
|
+
closeDb();
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
it('should load graph with metrics correctly', () => {
|
|
24
|
+
const siteRepo = new SiteRepository(db);
|
|
25
|
+
const snapshotRepo = new SnapshotRepository(db);
|
|
26
|
+
const pageRepo = new PageRepository(db);
|
|
27
|
+
const metricsRepo = new MetricsRepository(db);
|
|
28
|
+
|
|
29
|
+
const siteId = siteRepo.createSite('example.com');
|
|
30
|
+
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
|
|
31
|
+
const url = 'http://example.com/page1';
|
|
32
|
+
|
|
33
|
+
// Create Page
|
|
34
|
+
pageRepo.upsertPage({
|
|
35
|
+
site_id: siteId,
|
|
36
|
+
normalized_url: url,
|
|
37
|
+
last_seen_snapshot_id: snapshotId,
|
|
38
|
+
http_status: 200,
|
|
39
|
+
depth: 0
|
|
40
|
+
});
|
|
41
|
+
const page = pageRepo.getPage(siteId, url)!;
|
|
42
|
+
|
|
43
|
+
// Insert Metrics
|
|
44
|
+
metricsRepo.insertMetrics({
|
|
45
|
+
snapshot_id: snapshotId,
|
|
46
|
+
page_id: page.id,
|
|
47
|
+
authority_score: 0.5,
|
|
48
|
+
hub_score: 0.2,
|
|
49
|
+
pagerank: 0.8,
|
|
50
|
+
pagerank_score: 80.0,
|
|
51
|
+
link_role: 'authority',
|
|
52
|
+
crawl_status: 'fetched',
|
|
53
|
+
word_count: 500,
|
|
54
|
+
thin_content_score: 10,
|
|
55
|
+
external_link_ratio: 0.1,
|
|
56
|
+
orphan_score: 5,
|
|
57
|
+
duplicate_cluster_id: null,
|
|
58
|
+
duplicate_type: null,
|
|
59
|
+
is_cluster_primary: 1
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
// Load Graph
|
|
63
|
+
const graph = loadGraphFromSnapshot(snapshotId);
|
|
64
|
+
const node = graph.nodes.get(url);
|
|
65
|
+
|
|
66
|
+
expect(node).toBeDefined();
|
|
67
|
+
expect(node?.authorityScore).toBe(0.5);
|
|
68
|
+
expect(node?.hubScore).toBe(0.2);
|
|
69
|
+
// Verify new fields
|
|
70
|
+
expect(node?.crawlStatus).toBe('fetched');
|
|
71
|
+
expect(node?.wordCount).toBe(500);
|
|
72
|
+
expect(node?.thinContentScore).toBe(10);
|
|
73
|
+
expect(node?.externalLinkRatio).toBe(0.1);
|
|
74
|
+
expect(node?.orphanScore).toBe(5);
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
it('should handle null metrics gracefully', () => {
|
|
78
|
+
const siteRepo = new SiteRepository(db);
|
|
79
|
+
const snapshotRepo = new SnapshotRepository(db);
|
|
80
|
+
const pageRepo = new PageRepository(db);
|
|
81
|
+
const metricsRepo = new MetricsRepository(db);
|
|
82
|
+
|
|
83
|
+
const siteId = siteRepo.createSite('example.com');
|
|
84
|
+
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
|
|
85
|
+
const url = 'http://example.com/page2';
|
|
86
|
+
|
|
87
|
+
pageRepo.upsertPage({
|
|
88
|
+
site_id: siteId,
|
|
89
|
+
normalized_url: url,
|
|
90
|
+
last_seen_snapshot_id: snapshotId,
|
|
91
|
+
http_status: 200,
|
|
92
|
+
depth: 1
|
|
93
|
+
});
|
|
94
|
+
const page = pageRepo.getPage(siteId, url)!;
|
|
95
|
+
|
|
96
|
+
// Insert Metrics with nulls
|
|
97
|
+
metricsRepo.insertMetrics({
|
|
98
|
+
snapshot_id: snapshotId,
|
|
99
|
+
page_id: page.id,
|
|
100
|
+
authority_score: null,
|
|
101
|
+
hub_score: null,
|
|
102
|
+
pagerank: null,
|
|
103
|
+
pagerank_score: null,
|
|
104
|
+
link_role: null,
|
|
105
|
+
crawl_status: null,
|
|
106
|
+
word_count: null,
|
|
107
|
+
thin_content_score: null,
|
|
108
|
+
external_link_ratio: null,
|
|
109
|
+
orphan_score: null,
|
|
110
|
+
duplicate_cluster_id: null,
|
|
111
|
+
duplicate_type: null,
|
|
112
|
+
is_cluster_primary: 0
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
const graph = loadGraphFromSnapshot(snapshotId);
|
|
116
|
+
const node = graph.nodes.get(url);
|
|
117
|
+
|
|
118
|
+
expect(node).toBeDefined();
|
|
119
|
+
// Check undefined
|
|
120
|
+
expect(node?.crawlStatus).toBeUndefined();
|
|
121
|
+
expect(node?.wordCount).toBeUndefined();
|
|
122
|
+
expect(node?.thinContentScore).toBeUndefined();
|
|
123
|
+
});
|
|
124
|
+
});
|
|
@@ -1,58 +1,59 @@
|
|
|
1
1
|
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import {
|
|
3
|
-
import { Metrics } from '../src/graph/metrics.js';
|
|
2
|
+
import { renderAnalysisHtml, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
|
|
4
3
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
topPageRankPages: [],
|
|
20
|
-
limitReached: false,
|
|
21
|
-
sessionStats: {
|
|
22
|
-
pagesFetched: 5,
|
|
23
|
-
pagesCached: 2,
|
|
24
|
-
pagesSkipped: 0,
|
|
25
|
-
totalFound: 7
|
|
26
|
-
}
|
|
27
|
-
};
|
|
4
|
+
const mockPage: PageAnalysis = {
|
|
5
|
+
url: 'https://example.com',
|
|
6
|
+
status: 200,
|
|
7
|
+
seoScore: 85,
|
|
8
|
+
thinScore: 10,
|
|
9
|
+
title: { value: 'Example Title', length: 13, status: 'ok' },
|
|
10
|
+
metaDescription: { value: 'Example Desc', length: 12, status: 'ok' },
|
|
11
|
+
h1: { count: 1, status: 'ok', matchesTitle: true },
|
|
12
|
+
content: { wordCount: 500, uniqueSentenceCount: 50, textHtmlRatio: 0.6 },
|
|
13
|
+
images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
|
|
14
|
+
links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
|
|
15
|
+
structuredData: { present: true, valid: true, types: ['Article'] },
|
|
16
|
+
meta: { canonical: 'https://example.com', noindex: false, nofollow: false }
|
|
17
|
+
};
|
|
28
18
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
19
|
+
const mockResult: AnalysisResult = {
|
|
20
|
+
site_summary: {
|
|
21
|
+
pages_analyzed: 1,
|
|
22
|
+
avg_seo_score: 85,
|
|
23
|
+
thin_pages: 0,
|
|
24
|
+
duplicate_titles: 0,
|
|
25
|
+
site_score: 90
|
|
26
|
+
},
|
|
27
|
+
site_scores: { overallScore: 90, seoHealthScore: 85 },
|
|
28
|
+
pages: [mockPage],
|
|
29
|
+
active_modules: { seo: true, content: true, accessibility: true }
|
|
30
|
+
};
|
|
33
31
|
|
|
34
|
-
|
|
32
|
+
describe('HTML Report Generation', () => {
|
|
33
|
+
test('generates single page report correctly', () => {
|
|
34
|
+
// If pages length is 1, it renders single page report
|
|
35
|
+
const html = renderAnalysisHtml(mockResult);
|
|
36
|
+
expect(html).toContain('<!DOCTYPE html>');
|
|
37
|
+
expect(html).toContain('Analysis for https://example.com');
|
|
38
|
+
expect(html).toContain('Example Title');
|
|
39
|
+
expect(html).toContain('Example Desc');
|
|
40
|
+
expect(html).toContain('500 words');
|
|
41
|
+
expect(html).toContain('<span class="status-ok">Valid</span>');
|
|
42
|
+
});
|
|
35
43
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
});
|
|
44
|
+
test('generates list report correctly', () => {
|
|
45
|
+
// Modify result to have 2 pages to trigger list view
|
|
46
|
+
const listResult: AnalysisResult = {
|
|
47
|
+
...mockResult,
|
|
48
|
+
pages: [mockPage, { ...mockPage, url: 'https://example.com/2' }]
|
|
49
|
+
};
|
|
50
|
+
const html = renderAnalysisHtml(listResult);
|
|
44
51
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
topAuthorityPages: [],
|
|
53
|
-
sessionStats: null
|
|
54
|
-
};
|
|
55
|
-
const html = generateHtml({ nodes: [], edges: [] }, mockMetrics as any);
|
|
56
|
-
expect(html).not.toContain('Session Crawl:');
|
|
57
|
-
});
|
|
52
|
+
expect(html).toContain('<!DOCTYPE html>');
|
|
53
|
+
expect(html).toContain('Crawlith Analysis Report');
|
|
54
|
+
expect(html).toContain('Pages: 1'); // site_summary.pages_analyzed is 1 in mockResult
|
|
55
|
+
expect(html).toContain('https://example.com');
|
|
56
|
+
expect(html).toContain('https://example.com/2');
|
|
57
|
+
expect(html).toContain('<td>85</td>'); // seoScore
|
|
58
|
+
});
|
|
58
59
|
});
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
+
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
3
|
+
import * as dns from 'dns';
|
|
4
|
+
|
|
5
|
+
vi.mock('dns', () => ({
|
|
6
|
+
lookup: vi.fn(),
|
|
7
|
+
resolve4: vi.fn(),
|
|
8
|
+
resolve6: vi.fn(),
|
|
9
|
+
}));
|
|
10
|
+
|
|
11
|
+
describe('IPGuard Secure Lookup', () => {
|
|
12
|
+
it('should resolve safe IPs', () => {
|
|
13
|
+
const lookupMock = vi.mocked(dns.lookup);
|
|
14
|
+
// Mock successful resolution
|
|
15
|
+
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
16
|
+
callback(null, '8.8.8.8', 4);
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
const callback = vi.fn();
|
|
20
|
+
IPGuard.secureLookup('google.com', {}, callback);
|
|
21
|
+
|
|
22
|
+
expect(callback).toHaveBeenCalledWith(null, '8.8.8.8', 4);
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it('should block internal IPs', () => {
|
|
26
|
+
const lookupMock = vi.mocked(dns.lookup);
|
|
27
|
+
// Mock internal IP resolution
|
|
28
|
+
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
29
|
+
callback(null, '127.0.0.1', 4);
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
const callback = vi.fn();
|
|
33
|
+
IPGuard.secureLookup('localhost', {}, callback);
|
|
34
|
+
|
|
35
|
+
expect(callback).toHaveBeenCalledWith(expect.any(Error), '127.0.0.1', 4);
|
|
36
|
+
const error = callback.mock.calls[0][0];
|
|
37
|
+
expect(error.message).toContain('Blocked internal IP');
|
|
38
|
+
expect(error.code).toBe('EBLOCKED');
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
it('should handle array of IPs (IPv4)', () => {
|
|
42
|
+
const lookupMock = vi.mocked(dns.lookup);
|
|
43
|
+
// Mock array resolution
|
|
44
|
+
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
45
|
+
// Mocking address array structure
|
|
46
|
+
const addresses = [
|
|
47
|
+
{ address: '1.1.1.1', family: 4 },
|
|
48
|
+
{ address: '127.0.0.1', family: 4 }
|
|
49
|
+
];
|
|
50
|
+
callback(null, addresses as any, 4);
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
const callback = vi.fn();
|
|
54
|
+
IPGuard.secureLookup('mixed.com', { all: true } as any, callback);
|
|
55
|
+
|
|
56
|
+
expect(callback).toHaveBeenCalledWith(expect.any(Error), expect.anything(), 4);
|
|
57
|
+
const error = callback.mock.calls[0][0];
|
|
58
|
+
expect(error.message).toContain('Blocked internal IP');
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
it('should pass through DNS errors', () => {
|
|
62
|
+
const lookupMock = vi.mocked(dns.lookup);
|
|
63
|
+
const dnsError = new Error('ENOTFOUND');
|
|
64
|
+
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
65
|
+
callback(dnsError as any, undefined as any, 0);
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
const callback = vi.fn();
|
|
69
|
+
IPGuard.secureLookup('invalid.domain', {}, callback);
|
|
70
|
+
|
|
71
|
+
expect(callback).toHaveBeenCalledWith(dnsError, undefined, 0);
|
|
72
|
+
});
|
|
73
|
+
});
|
|
@@ -6,6 +6,7 @@ import { existsSync, unlinkSync, readFileSync } from 'node:fs';
|
|
|
6
6
|
import path from 'node:path';
|
|
7
7
|
import os from 'node:os';
|
|
8
8
|
import { isPidAlive } from '../../src/lock/pidCheck.js';
|
|
9
|
+
import { EngineContext } from '../../src/events.js';
|
|
9
10
|
|
|
10
11
|
// Mock fs and os
|
|
11
12
|
vi.mock('node:fs/promises');
|
|
@@ -15,6 +16,8 @@ vi.mock('../../src/lock/pidCheck.js', () => ({
|
|
|
15
16
|
isPidAlive: vi.fn()
|
|
16
17
|
}));
|
|
17
18
|
|
|
19
|
+
const mockContext: EngineContext = { emit: vi.fn() };
|
|
20
|
+
|
|
18
21
|
describe('LockManager', () => {
|
|
19
22
|
const mockHomeDir = '/home/user';
|
|
20
23
|
const lockDir = path.join(mockHomeDir, '.crawlith', 'locks');
|
|
@@ -40,23 +43,18 @@ describe('LockManager', () => {
|
|
|
40
43
|
vi.spyOn(process, 'exit').mockImplementation((code) => {
|
|
41
44
|
throw new Error(`Process exit ${code}`);
|
|
42
45
|
});
|
|
43
|
-
|
|
44
|
-
// Mock console to suppress noise
|
|
45
|
-
vi.spyOn(console, 'log').mockImplementation(() => {});
|
|
46
|
-
vi.spyOn(console, 'warn').mockImplementation(() => {});
|
|
47
|
-
vi.spyOn(console, 'error').mockImplementation(() => {});
|
|
48
|
-
|
|
49
46
|
// Reset static state if any (LockManager stores lockFilePath)
|
|
50
|
-
// We can't easily reset private static via TS, but we can call releaseLock which clears it if set
|
|
51
47
|
LockManager.releaseLock();
|
|
52
48
|
});
|
|
53
49
|
|
|
54
50
|
afterEach(() => {
|
|
55
51
|
vi.restoreAllMocks();
|
|
52
|
+
// Reset static state
|
|
53
|
+
LockManager.releaseLock();
|
|
56
54
|
});
|
|
57
55
|
|
|
58
56
|
it('should acquire lock when no lock exists', async () => {
|
|
59
|
-
await LockManager.acquireLock(command, target, options);
|
|
57
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
60
58
|
|
|
61
59
|
expect(fs.mkdir).toHaveBeenCalledWith(lockDir, { recursive: true });
|
|
62
60
|
expect(fs.writeFile).toHaveBeenCalledWith(
|
|
@@ -77,9 +75,9 @@ describe('LockManager', () => {
|
|
|
77
75
|
}));
|
|
78
76
|
vi.mocked(isPidAlive).mockReturnValue(true);
|
|
79
77
|
|
|
80
|
-
await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');
|
|
78
|
+
await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
|
|
81
79
|
|
|
82
|
-
expect(
|
|
80
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('already running') }));
|
|
83
81
|
});
|
|
84
82
|
|
|
85
83
|
it('should clear stale lock and acquire if PID is dead', async () => {
|
|
@@ -93,11 +91,11 @@ describe('LockManager', () => {
|
|
|
93
91
|
}));
|
|
94
92
|
vi.mocked(isPidAlive).mockReturnValue(false);
|
|
95
93
|
|
|
96
|
-
await LockManager.acquireLock(command, target, options);
|
|
94
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
97
95
|
|
|
98
96
|
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
99
97
|
expect(fs.writeFile).toHaveBeenCalled();
|
|
100
|
-
expect(
|
|
98
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'info', message: expect.stringContaining('Detected stale lock') }));
|
|
101
99
|
});
|
|
102
100
|
|
|
103
101
|
it('should override lock if force is true', async () => {
|
|
@@ -108,24 +106,24 @@ describe('LockManager', () => {
|
|
|
108
106
|
}));
|
|
109
107
|
vi.mocked(isPidAlive).mockReturnValue(true);
|
|
110
108
|
|
|
111
|
-
await LockManager.acquireLock(command, target, options, true); // force = true
|
|
109
|
+
await LockManager.acquireLock(command, target, options, mockContext, true); // force = true
|
|
112
110
|
|
|
113
111
|
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
114
112
|
expect(fs.writeFile).toHaveBeenCalled();
|
|
115
|
-
expect(
|
|
113
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn', message: expect.stringContaining('Force mode enabled') }));
|
|
116
114
|
});
|
|
117
115
|
|
|
118
116
|
it('should handle race condition (EEXIST)', async () => {
|
|
119
117
|
vi.mocked(existsSync).mockReturnValue(false);
|
|
120
118
|
vi.mocked(fs.writeFile).mockRejectedValue({ code: 'EEXIST' });
|
|
121
119
|
|
|
122
|
-
await expect(LockManager.acquireLock(command, target, options)).rejects.toThrow('Process exit 1');
|
|
123
|
-
expect(
|
|
120
|
+
await expect(LockManager.acquireLock(command, target, options, mockContext)).rejects.toThrow('Process exit 1');
|
|
121
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Race condition') }));
|
|
124
122
|
});
|
|
125
123
|
|
|
126
124
|
it('should release lock on exit', async () => {
|
|
127
125
|
// Acquire first (existsSync returns false by default from beforeEach)
|
|
128
|
-
await LockManager.acquireLock(command, target, options);
|
|
126
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
129
127
|
|
|
130
128
|
// Simulate file exists for release
|
|
131
129
|
vi.mocked(existsSync).mockReturnValue(true);
|
|
@@ -135,4 +133,66 @@ describe('LockManager', () => {
|
|
|
135
133
|
|
|
136
134
|
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
137
135
|
});
|
|
136
|
+
|
|
137
|
+
it('should register signal handlers and cleanup on SIGINT', async () => {
|
|
138
|
+
const processOnSpy = vi.spyOn(process, 'on');
|
|
139
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
140
|
+
|
|
141
|
+
// Find the handler
|
|
142
|
+
const sigintCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGINT');
|
|
143
|
+
expect(sigintCall).toBeDefined();
|
|
144
|
+
const handler = sigintCall![1] as () => void;
|
|
145
|
+
|
|
146
|
+
// Trigger handler
|
|
147
|
+
vi.mocked(existsSync).mockReturnValue(true); // Simulate file still exists
|
|
148
|
+
try {
|
|
149
|
+
handler();
|
|
150
|
+
} catch (e: any) {
|
|
151
|
+
// Expect process.exit(130) which throws error in our mock
|
|
152
|
+
expect(e.message).toBe('Process exit 130');
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
it('should register signal handlers and cleanup on SIGTERM', async () => {
|
|
159
|
+
const processOnSpy = vi.spyOn(process, 'on');
|
|
160
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
161
|
+
|
|
162
|
+
// Find the handler
|
|
163
|
+
const sigtermCall = processOnSpy.mock.calls.find(call => call[0] === 'SIGTERM');
|
|
164
|
+
expect(sigtermCall).toBeDefined();
|
|
165
|
+
const handler = sigtermCall![1] as () => void;
|
|
166
|
+
|
|
167
|
+
// Trigger handler
|
|
168
|
+
vi.mocked(existsSync).mockReturnValue(true);
|
|
169
|
+
try {
|
|
170
|
+
handler();
|
|
171
|
+
} catch (e: any) {
|
|
172
|
+
expect(e.message).toBe('Process exit 143');
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it('should register signal handlers and cleanup on uncaughtException', async () => {
|
|
179
|
+
const processOnSpy = vi.spyOn(process, 'on');
|
|
180
|
+
await LockManager.acquireLock(command, target, options, mockContext);
|
|
181
|
+
|
|
182
|
+
// Find the handler
|
|
183
|
+
const uncaughtExceptionCall = processOnSpy.mock.calls.find(call => call[0] === 'uncaughtException');
|
|
184
|
+
expect(uncaughtExceptionCall).toBeDefined();
|
|
185
|
+
const handler = uncaughtExceptionCall![1] as (err: Error) => void;
|
|
186
|
+
|
|
187
|
+
// Trigger handler
|
|
188
|
+
vi.mocked(existsSync).mockReturnValue(true);
|
|
189
|
+
try {
|
|
190
|
+
handler(new Error('Test error'));
|
|
191
|
+
} catch (e: any) {
|
|
192
|
+
expect(e.message).toBe('Process exit 1');
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
expect(unlinkSync).toHaveBeenCalledWith(lockPath);
|
|
196
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'error', message: expect.stringContaining('Uncaught Exception'), error: expect.any(Error) }));
|
|
197
|
+
});
|
|
138
198
|
});
|
package/tests/normalize.test.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { normalizeUrl } from '../src/crawler/normalize.js';
|
|
2
|
-
import { extractLinks } from '../src/crawler/extract.js';
|
|
3
2
|
import { test, expect } from 'vitest';
|
|
4
3
|
|
|
5
4
|
test('normalizeUrl', () => {
|
|
@@ -11,24 +10,6 @@ test('normalizeUrl', () => {
|
|
|
11
10
|
expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
|
|
12
11
|
});
|
|
13
12
|
|
|
14
|
-
test('extractLinks', () => {
|
|
15
|
-
const html = `
|
|
16
|
-
<html>
|
|
17
|
-
<body>
|
|
18
|
-
<a href="/foo">Foo</a>
|
|
19
|
-
<a href="bar">Bar</a>
|
|
20
|
-
<a href="https://other.com/baz">Baz</a>
|
|
21
|
-
<a href="#top">Top</a>
|
|
22
|
-
</body>
|
|
23
|
-
</html>
|
|
24
|
-
`;
|
|
25
|
-
const links = extractLinks(html, 'https://example.com/page/');
|
|
26
|
-
expect(links).toContain('https://example.com/foo');
|
|
27
|
-
expect(links).toContain('https://example.com/page/bar');
|
|
28
|
-
expect(links).toContain('https://other.com/baz');
|
|
29
|
-
expect(links).not.toContain('https://example.com/page/#top');
|
|
30
|
-
expect(links).toContain('https://example.com/page/'); // #top resolves to base url without fragment
|
|
31
|
-
});
|
|
32
13
|
test('normalizeUrl: absolute resolution', () => {
|
|
33
14
|
expect(normalizeUrl('/foo', 'https://example.com')).toBe('https://example.com/foo');
|
|
34
15
|
expect(normalizeUrl('bar', 'https://example.com/baz/')).toBe('https://example.com/baz/bar');
|
|
@@ -95,6 +76,12 @@ test('normalizeUrl: skip non-HTML assets', () => {
|
|
|
95
76
|
expect(normalizeUrl('https://example.com/page', '')).toBe('https://example.com/page');
|
|
96
77
|
});
|
|
97
78
|
|
|
79
|
+
test('normalizeUrl: invalid URL', () => {
|
|
80
|
+
expect(normalizeUrl('/foo', '')).toBeNull();
|
|
81
|
+
expect(normalizeUrl('invalid-url', '')).toBeNull();
|
|
82
|
+
expect(normalizeUrl('/foo', 'invalid-base')).toBeNull();
|
|
83
|
+
});
|
|
84
|
+
|
|
98
85
|
test('normalizeUrl: return format', () => {
|
|
99
86
|
const res = normalizeUrl('https://example.com/foo?a=1', '');
|
|
100
87
|
expect(res).toBe('https://example.com/foo?a=1');
|