@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import { annotateOrphans, calculateOrphanSeverity, mapImpactLevel, type
|
|
2
|
+
import { annotateOrphans, calculateOrphanSeverity, mapImpactLevel, type CrawlNode, type CrawlEdge } from '../src/scoring/orphanSeverity.js';
|
|
3
3
|
|
|
4
|
-
function baseNode(url: string, overrides: Partial<
|
|
4
|
+
function baseNode(url: string, overrides: Partial<CrawlNode> = {}): CrawlNode {
|
|
5
5
|
return {
|
|
6
6
|
url,
|
|
7
7
|
depth: 1,
|
|
@@ -14,11 +14,11 @@ function baseNode(url: string, overrides: Partial<SitegraphNode> = {}): Sitegrap
|
|
|
14
14
|
|
|
15
15
|
describe('orphan detection and severity scoring', () => {
|
|
16
16
|
test('hard orphan detection and homepage exclusion', () => {
|
|
17
|
-
const nodes:
|
|
17
|
+
const nodes: CrawlNode[] = [
|
|
18
18
|
baseNode('https://example.com/', { depth: 0, inLinks: 0 }),
|
|
19
19
|
baseNode('https://example.com/orphan', { inLinks: 0 })
|
|
20
20
|
];
|
|
21
|
-
const edges:
|
|
21
|
+
const edges: CrawlEdge[] = [];
|
|
22
22
|
|
|
23
23
|
const result = annotateOrphans(nodes, edges, {
|
|
24
24
|
enabled: true,
|
|
@@ -34,7 +34,7 @@ describe('orphan detection and severity scoring', () => {
|
|
|
34
34
|
|
|
35
35
|
test('near orphan threshold override', () => {
|
|
36
36
|
const nodes = [baseNode('https://example.com/near', { inLinks: 2 })];
|
|
37
|
-
const edges:
|
|
37
|
+
const edges: CrawlEdge[] = [];
|
|
38
38
|
|
|
39
39
|
const resultDefault = annotateOrphans(nodes, edges, {
|
|
40
40
|
enabled: true,
|
|
@@ -54,14 +54,14 @@ describe('orphan detection and severity scoring', () => {
|
|
|
54
54
|
});
|
|
55
55
|
|
|
56
56
|
test('soft orphan detection only when enabled and inbound only from low-value sources', () => {
|
|
57
|
-
const nodes:
|
|
57
|
+
const nodes: CrawlNode[] = [
|
|
58
58
|
baseNode('https://example.com/tag/seo', { pageType: 'tag', outLinks: 1 }),
|
|
59
59
|
baseNode('https://example.com/list?page=2', { pageType: 'pagination', outLinks: 1 }),
|
|
60
60
|
baseNode('https://example.com/target', { inLinks: 2 }),
|
|
61
61
|
baseNode('https://example.com/normal', { outLinks: 1 })
|
|
62
62
|
];
|
|
63
63
|
|
|
64
|
-
const edges:
|
|
64
|
+
const edges: CrawlEdge[] = [
|
|
65
65
|
{ source: 'https://example.com/tag/seo', target: 'https://example.com/target' },
|
|
66
66
|
{ source: 'https://example.com/list?page=2', target: 'https://example.com/target' }
|
|
67
67
|
];
|
|
@@ -129,14 +129,14 @@ describe('orphan detection and severity scoring', () => {
|
|
|
129
129
|
});
|
|
130
130
|
|
|
131
131
|
test('canonical consolidation, robots exclusion, and deterministic JSON output snapshot', () => {
|
|
132
|
-
const nodes:
|
|
132
|
+
const nodes: CrawlNode[] = [
|
|
133
133
|
baseNode('https://example.com/canonical', { inLinks: 0 }),
|
|
134
134
|
baseNode('https://example.com/variant?a=1', { canonicalUrl: 'https://example.com/canonical', inLinks: 1 }),
|
|
135
135
|
baseNode('https://example.com/blocked', { inLinks: 0, robotsExcluded: true }),
|
|
136
136
|
baseNode('https://example.com/redirect-target', { inLinks: 1 })
|
|
137
137
|
];
|
|
138
138
|
|
|
139
|
-
const edges:
|
|
139
|
+
const edges: CrawlEdge[] = [
|
|
140
140
|
{ source: 'https://example.com/redirect-source', target: 'https://example.com/redirect-target' }
|
|
141
141
|
];
|
|
142
142
|
|
|
@@ -5,7 +5,11 @@ import { request } from 'undici';
|
|
|
5
5
|
|
|
6
6
|
vi.mock('undici', () => ({
|
|
7
7
|
request: vi.fn(),
|
|
8
|
-
ProxyAgent: vi.fn().mockImplementation(() => ({ dispatcher: {} }))
|
|
8
|
+
ProxyAgent: vi.fn().mockImplementation(() => ({ dispatcher: {} })),
|
|
9
|
+
Agent: class {
|
|
10
|
+
dispatch = vi.fn();
|
|
11
|
+
},
|
|
12
|
+
Dispatcher: class {}
|
|
9
13
|
}));
|
|
10
14
|
|
|
11
15
|
describe('RedirectController', () => {
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest';
|
|
2
|
+
import { renderAnalysisCsv, AnalysisResult } from '../src/analysis/analyze.js';
|
|
3
|
+
|
|
4
|
+
describe('renderAnalysisCsv', () => {
|
|
5
|
+
test('renders CSV with headers', () => {
|
|
6
|
+
const result: AnalysisResult = {
|
|
7
|
+
pages: [],
|
|
8
|
+
site_summary: {
|
|
9
|
+
pages_analyzed: 0,
|
|
10
|
+
avg_seo_score: 0,
|
|
11
|
+
thin_pages: 0,
|
|
12
|
+
duplicate_titles: 0,
|
|
13
|
+
site_score: 0
|
|
14
|
+
},
|
|
15
|
+
site_scores: {} as any,
|
|
16
|
+
active_modules: {
|
|
17
|
+
seo: true,
|
|
18
|
+
content: true,
|
|
19
|
+
accessibility: true
|
|
20
|
+
}
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
const csv = renderAnalysisCsv(result);
|
|
24
|
+
expect(csv).toContain('URL,SEO Score,Thin Score,HTTP Status,Title,Title Length,Meta Description,Desc Length,Word Count,Internal Links,External Links');
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
test('renders a single page correctly', () => {
|
|
28
|
+
const result: AnalysisResult = {
|
|
29
|
+
pages: [
|
|
30
|
+
{
|
|
31
|
+
url: 'https://example.com',
|
|
32
|
+
status: 200,
|
|
33
|
+
seoScore: 85,
|
|
34
|
+
thinScore: 10,
|
|
35
|
+
title: { value: 'Example Domain', length: 14, status: 'ok' },
|
|
36
|
+
metaDescription: { value: 'This is an example description.', length: 29, status: 'ok' },
|
|
37
|
+
content: { wordCount: 500 } as any,
|
|
38
|
+
links: { internalLinks: 5, externalLinks: 2 } as any,
|
|
39
|
+
h1: {} as any,
|
|
40
|
+
images: {} as any,
|
|
41
|
+
structuredData: {} as any,
|
|
42
|
+
meta: {}
|
|
43
|
+
}
|
|
44
|
+
],
|
|
45
|
+
site_summary: {
|
|
46
|
+
pages_analyzed: 1,
|
|
47
|
+
avg_seo_score: 85,
|
|
48
|
+
thin_pages: 0,
|
|
49
|
+
duplicate_titles: 0,
|
|
50
|
+
site_score: 85
|
|
51
|
+
},
|
|
52
|
+
site_scores: {} as any,
|
|
53
|
+
active_modules: {
|
|
54
|
+
seo: true,
|
|
55
|
+
content: true,
|
|
56
|
+
accessibility: true
|
|
57
|
+
}
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
const csv = renderAnalysisCsv(result);
|
|
61
|
+
const lines = csv.split('\n');
|
|
62
|
+
expect(lines.length).toBe(2);
|
|
63
|
+
expect(lines[1]).toContain('https://example.com,85,10,200,"Example Domain",14,"This is an example description.",29,500,5,2');
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
test('escapes quotes in title and meta description', () => {
|
|
67
|
+
const result: AnalysisResult = {
|
|
68
|
+
pages: [
|
|
69
|
+
{
|
|
70
|
+
url: 'https://example.com/quote',
|
|
71
|
+
status: 200,
|
|
72
|
+
seoScore: 90,
|
|
73
|
+
thinScore: 5,
|
|
74
|
+
title: { value: 'Example "Quoted" Domain', length: 23, status: 'ok' },
|
|
75
|
+
metaDescription: { value: 'This description contains "quotes" inside.', length: 42, status: 'ok' },
|
|
76
|
+
content: { wordCount: 300 } as any,
|
|
77
|
+
links: { internalLinks: 3, externalLinks: 1 } as any,
|
|
78
|
+
h1: {} as any,
|
|
79
|
+
images: {} as any,
|
|
80
|
+
structuredData: {} as any,
|
|
81
|
+
meta: {}
|
|
82
|
+
}
|
|
83
|
+
],
|
|
84
|
+
site_summary: {
|
|
85
|
+
pages_analyzed: 1,
|
|
86
|
+
avg_seo_score: 90,
|
|
87
|
+
thin_pages: 0,
|
|
88
|
+
duplicate_titles: 0,
|
|
89
|
+
site_score: 90
|
|
90
|
+
},
|
|
91
|
+
site_scores: {} as any,
|
|
92
|
+
active_modules: {
|
|
93
|
+
seo: true,
|
|
94
|
+
content: true,
|
|
95
|
+
accessibility: true
|
|
96
|
+
}
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
const csv = renderAnalysisCsv(result);
|
|
100
|
+
const lines = csv.split('\n');
|
|
101
|
+
// Expect double quotes to be escaped with double quotes: " -> ""
|
|
102
|
+
// And the whole field wrapped in quotes
|
|
103
|
+
expect(lines[1]).toContain('"Example ""Quoted"" Domain"');
|
|
104
|
+
expect(lines[1]).toContain('"This description contains ""quotes"" inside."');
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
test('handles Pending/Limit status (status: 0)', () => {
|
|
108
|
+
const result: AnalysisResult = {
|
|
109
|
+
pages: [
|
|
110
|
+
{
|
|
111
|
+
url: 'https://example.com/pending',
|
|
112
|
+
status: 0,
|
|
113
|
+
seoScore: 0,
|
|
114
|
+
thinScore: 0,
|
|
115
|
+
title: { value: null, length: 0, status: 'missing' },
|
|
116
|
+
metaDescription: { value: null, length: 0, status: 'missing' },
|
|
117
|
+
content: { wordCount: 0 } as any,
|
|
118
|
+
links: { internalLinks: 0, externalLinks: 0 } as any,
|
|
119
|
+
h1: {} as any,
|
|
120
|
+
images: {} as any,
|
|
121
|
+
structuredData: {} as any,
|
|
122
|
+
meta: {}
|
|
123
|
+
}
|
|
124
|
+
],
|
|
125
|
+
site_summary: {
|
|
126
|
+
pages_analyzed: 1,
|
|
127
|
+
avg_seo_score: 0,
|
|
128
|
+
thin_pages: 0,
|
|
129
|
+
duplicate_titles: 0,
|
|
130
|
+
site_score: 0
|
|
131
|
+
},
|
|
132
|
+
site_scores: {} as any,
|
|
133
|
+
active_modules: {
|
|
134
|
+
seo: true,
|
|
135
|
+
content: true,
|
|
136
|
+
accessibility: true
|
|
137
|
+
}
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
const csv = renderAnalysisCsv(result);
|
|
141
|
+
const lines = csv.split('\n');
|
|
142
|
+
expect(lines[1]).toContain('Pending/Limit');
|
|
143
|
+
});
|
|
144
|
+
|
|
145
|
+
test('handles missing title and description gracefully', () => {
|
|
146
|
+
const result: AnalysisResult = {
|
|
147
|
+
pages: [
|
|
148
|
+
{
|
|
149
|
+
url: 'https://example.com/missing',
|
|
150
|
+
status: 404,
|
|
151
|
+
seoScore: 0,
|
|
152
|
+
thinScore: 0,
|
|
153
|
+
title: { value: undefined as any, length: 0, status: 'missing' },
|
|
154
|
+
metaDescription: { value: null as any, length: 0, status: 'missing' },
|
|
155
|
+
content: { wordCount: 0 } as any,
|
|
156
|
+
links: { internalLinks: 0, externalLinks: 0 } as any,
|
|
157
|
+
h1: {} as any,
|
|
158
|
+
images: {} as any,
|
|
159
|
+
structuredData: {} as any,
|
|
160
|
+
meta: {}
|
|
161
|
+
}
|
|
162
|
+
],
|
|
163
|
+
site_summary: {
|
|
164
|
+
pages_analyzed: 1,
|
|
165
|
+
avg_seo_score: 0,
|
|
166
|
+
thin_pages: 0,
|
|
167
|
+
duplicate_titles: 0,
|
|
168
|
+
site_score: 0
|
|
169
|
+
},
|
|
170
|
+
site_scores: {} as any,
|
|
171
|
+
active_modules: {
|
|
172
|
+
seo: true,
|
|
173
|
+
content: true,
|
|
174
|
+
accessibility: true
|
|
175
|
+
}
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
const csv = renderAnalysisCsv(result);
|
|
179
|
+
const lines = csv.split('\n');
|
|
180
|
+
// Should produce empty quoted strings ""
|
|
181
|
+
expect(lines[1]).toContain(',"",0,"",0,0,0,0');
|
|
182
|
+
});
|
|
183
|
+
});
|
package/tests/safety.test.ts
CHANGED
|
@@ -34,6 +34,18 @@ describe('IPGuard', () => {
|
|
|
34
34
|
expect(IPGuard.isInternal('fe80::1')).toBe(true);
|
|
35
35
|
});
|
|
36
36
|
|
|
37
|
+
it('should block IPv4-mapped IPv6 internal addresses', () => {
|
|
38
|
+
expect(IPGuard.isInternal('::ffff:127.0.0.1')).toBe(true);
|
|
39
|
+
expect(IPGuard.isInternal('::ffff:10.0.0.1')).toBe(true);
|
|
40
|
+
expect(IPGuard.isInternal('::ffff:192.168.1.1')).toBe(true);
|
|
41
|
+
expect(IPGuard.isInternal('::ffff:169.254.169.254')).toBe(true);
|
|
42
|
+
expect(IPGuard.isInternal('::ffff:7f00:0001')).toBe(true); // Hex 127.0.0.1
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it('should allow IPv4-mapped IPv6 public addresses', () => {
|
|
46
|
+
expect(IPGuard.isInternal('::ffff:8.8.8.8')).toBe(false);
|
|
47
|
+
});
|
|
48
|
+
|
|
37
49
|
it('should validate hostname by resolving IPs', async () => {
|
|
38
50
|
const resolve4Spy = vi.mocked(dns.resolve4);
|
|
39
51
|
const resolve6Spy = vi.mocked(dns.resolve6);
|
package/tests/scope.test.ts
CHANGED
|
@@ -25,6 +25,13 @@ describe('DomainFilter', () => {
|
|
|
25
25
|
const filter = new DomainFilter(['allowed.com']);
|
|
26
26
|
expect(filter.isAllowed('other.com')).toBe(false);
|
|
27
27
|
});
|
|
28
|
+
|
|
29
|
+
it('should fallback to raw string on invalid hostname', () => {
|
|
30
|
+
// '[' and 'http://denied-invalid-[' causes new URL() to throw
|
|
31
|
+
const filter = new DomainFilter(['['], ['denied-invalid-[']);
|
|
32
|
+
expect(filter.isAllowed('[')).toBe(true);
|
|
33
|
+
expect(filter.isAllowed('denied-invalid-[')).toBe(false);
|
|
34
|
+
});
|
|
28
35
|
});
|
|
29
36
|
|
|
30
37
|
describe('SubdomainPolicy', () => {
|
|
@@ -63,4 +70,15 @@ describe('ScopeManager', () => {
|
|
|
63
70
|
expect(manager.isUrlEligible('https://other.com/')).toBe('allowed');
|
|
64
71
|
expect(manager.isUrlEligible('https://google.com/')).toBe('blocked_by_domain_filter');
|
|
65
72
|
});
|
|
73
|
+
|
|
74
|
+
it('should handle trailing dots in hostnames', () => {
|
|
75
|
+
const manager = new ScopeManager({
|
|
76
|
+
rootUrl: 'https://example.com',
|
|
77
|
+
allowedDomains: ['example.com.'],
|
|
78
|
+
includeSubdomains: false
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
expect(manager.isUrlEligible('https://example.com./')).toBe('allowed');
|
|
82
|
+
expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
|
|
83
|
+
});
|
|
66
84
|
});
|
package/tests/scoring.test.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { expect, test } from 'vitest';
|
|
2
|
-
import { scorePageSeo
|
|
2
|
+
import { scorePageSeo } from '../src/analysis/scoring.js';
|
|
3
3
|
import { PageAnalysis } from '../src/analysis/analyze.js';
|
|
4
4
|
|
|
5
5
|
const basePage: PageAnalysis = {
|
|
@@ -13,7 +13,8 @@ const basePage: PageAnalysis = {
|
|
|
13
13
|
images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
|
|
14
14
|
links: { internalLinks: 5, externalLinks: 2, nofollowCount: 1, externalRatio: 2 / 7 },
|
|
15
15
|
structuredData: { present: true, valid: true, types: ['Article'] },
|
|
16
|
-
seoScore: 0
|
|
16
|
+
seoScore: 0,
|
|
17
|
+
meta: { noindex: false, nofollow: false }
|
|
17
18
|
};
|
|
18
19
|
|
|
19
20
|
test('page score stays in 0-100', () => {
|
|
@@ -34,26 +35,26 @@ test('page score stays in 0-100', () => {
|
|
|
34
35
|
expect(scorePageSeo(badPage)).toBeLessThan(50);
|
|
35
36
|
});
|
|
36
37
|
|
|
37
|
-
test('aggregate site score includes existing metrics signals', () => {
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
38
|
+
// test('aggregate site score includes existing metrics signals', () => {
|
|
39
|
+
// const score = aggregateSiteScore({
|
|
40
|
+
// totalPages: 2,
|
|
41
|
+
// totalEdges: 1,
|
|
42
|
+
// orphanPages: ['https://example.com/x'],
|
|
43
|
+
// nearOrphans: [],
|
|
44
|
+
// deepPages: [],
|
|
45
|
+
// topAuthorityPages: [{ url: 'a', authority: 1 }],
|
|
46
|
+
// averageOutDegree: 1,
|
|
47
|
+
// maxDepthFound: 1,
|
|
48
|
+
// crawlEfficiencyScore: 0.8,
|
|
49
|
+
// averageDepth: 1,
|
|
50
|
+
// structuralEntropy: 2,
|
|
51
|
+
// limitReached: false
|
|
52
|
+
// }, [
|
|
53
|
+
// { ...basePage, seoScore: 70 },
|
|
54
|
+
// { ...basePage, seoScore: 90, url: 'https://example.com/2' }
|
|
55
|
+
// ]);
|
|
55
56
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
});
|
|
57
|
+
// expect(score.seoHealthScore).toBe(80);
|
|
58
|
+
// expect(score.overallScore).toBeGreaterThan(0);
|
|
59
|
+
// expect(score.overallScore).toBeLessThanOrEqual(100);
|
|
60
|
+
// });
|
package/tests/sitemap.test.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { test, expect, beforeEach } from 'vitest';
|
|
1
|
+
import { test, expect, beforeEach, vi } from 'vitest';
|
|
2
2
|
import { Sitemap } from '../src/crawler/sitemap.js';
|
|
3
3
|
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
4
|
+
import { EngineContext } from '../src/events.js';
|
|
4
5
|
|
|
5
6
|
let mockAgent: MockAgent;
|
|
6
7
|
|
|
@@ -86,3 +87,14 @@ test('handles fetch errors gracefully', async () => {
|
|
|
86
87
|
const urls = await sitemap.fetch('https://example.com/error.xml');
|
|
87
88
|
expect(urls.length).toBe(0);
|
|
88
89
|
});
|
|
90
|
+
|
|
91
|
+
test('emits warning on fetch error', async () => {
|
|
92
|
+
const client = mockAgent.get('https://example.com');
|
|
93
|
+
client.intercept({ path: '/error.xml', method: 'GET' }).replyWithError(new Error('Network error'));
|
|
94
|
+
|
|
95
|
+
const mockContext: EngineContext = { emit: vi.fn() };
|
|
96
|
+
const sitemap = new Sitemap(mockContext);
|
|
97
|
+
await sitemap.fetch('https://example.com/error.xml');
|
|
98
|
+
|
|
99
|
+
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn' }));
|
|
100
|
+
});
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
|
+
import { request } from 'undici';
|
|
4
|
+
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
5
|
+
|
|
6
|
+
// Mock undici request to fail with EBLOCKED
|
|
7
|
+
vi.mock('undici', () => {
|
|
8
|
+
return {
|
|
9
|
+
request: vi.fn(),
|
|
10
|
+
Agent: class {
|
|
11
|
+
dispatch = vi.fn();
|
|
12
|
+
},
|
|
13
|
+
Dispatcher: class {}
|
|
14
|
+
};
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
// Mock IPGuard.validateHost to pass
|
|
18
|
+
vi.mock('../src/core/security/ipGuard.js', async () => {
|
|
19
|
+
const original = await vi.importActual('../src/core/security/ipGuard.js');
|
|
20
|
+
return {
|
|
21
|
+
...original as any,
|
|
22
|
+
IPGuard: {
|
|
23
|
+
...original.IPGuard,
|
|
24
|
+
validateHost: vi.fn().mockResolvedValue(true), // Pass step 1
|
|
25
|
+
getSecureDispatcher: vi.fn()
|
|
26
|
+
}
|
|
27
|
+
};
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
describe('SSRF Fix Reproduction', () => {
|
|
31
|
+
let fetcher: Fetcher;
|
|
32
|
+
|
|
33
|
+
beforeEach(() => {
|
|
34
|
+
vi.clearAllMocks();
|
|
35
|
+
// Setup default mock return for dispatcher
|
|
36
|
+
vi.mocked(IPGuard.getSecureDispatcher).mockReturnValue({} as any);
|
|
37
|
+
fetcher = new Fetcher({ rate: 100 });
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
it('should return blocked_internal_ip when secureDispatcher blocks', async () => {
|
|
41
|
+
const mockRequest = vi.mocked(request);
|
|
42
|
+
const mockGetSecureDispatcher = vi.mocked(IPGuard.getSecureDispatcher);
|
|
43
|
+
const mockDispatcher = { dispatch: vi.fn() } as any;
|
|
44
|
+
mockGetSecureDispatcher.mockReturnValue(mockDispatcher);
|
|
45
|
+
|
|
46
|
+
// Re-initialize fetcher so it calls getSecureDispatcher and gets our specific mock
|
|
47
|
+
fetcher = new Fetcher({ rate: 100 });
|
|
48
|
+
|
|
49
|
+
// Simulate secureDispatcher blocking via undici request throwing EBLOCKED
|
|
50
|
+
const blockedError = new Error('Blocked internal IP: 127.0.0.1');
|
|
51
|
+
(blockedError as any).code = 'EBLOCKED';
|
|
52
|
+
|
|
53
|
+
mockRequest.mockRejectedValueOnce(blockedError);
|
|
54
|
+
|
|
55
|
+
const res = await fetcher.fetch('http://example.com');
|
|
56
|
+
|
|
57
|
+
// Now we expect correct handling
|
|
58
|
+
expect(res.status).toBe('blocked_internal_ip');
|
|
59
|
+
|
|
60
|
+
// Verify that the secure dispatcher was indeed used
|
|
61
|
+
expect(mockGetSecureDispatcher).toHaveBeenCalled();
|
|
62
|
+
expect(mockRequest).toHaveBeenCalledWith(
|
|
63
|
+
expect.stringContaining('http://example.com'),
|
|
64
|
+
expect.objectContaining({
|
|
65
|
+
dispatcher: mockDispatcher
|
|
66
|
+
})
|
|
67
|
+
);
|
|
68
|
+
});
|
|
69
|
+
});
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import {
|
|
2
|
+
import { Crawl_HTML } from '../src/report/crawl_template.js';
|
|
3
3
|
import { Graph } from '../src/graph/graph.js';
|
|
4
4
|
import { computePageRank } from '../src/graph/pagerank.js';
|
|
5
5
|
|
|
@@ -23,24 +23,24 @@ describe('Visualization Data & Template', () => {
|
|
|
23
23
|
});
|
|
24
24
|
|
|
25
25
|
it('should contain UI toggle buttons for Authority Mode', () => {
|
|
26
|
-
expect(
|
|
27
|
-
expect(
|
|
26
|
+
expect(Crawl_HTML).toContain('id="btn-auth-pagerank"');
|
|
27
|
+
expect(Crawl_HTML).toContain('id="btn-auth-structural"');
|
|
28
28
|
});
|
|
29
29
|
|
|
30
30
|
it('should contain setAuthorityMode function', () => {
|
|
31
31
|
// Use regex to be flexible with whitespace
|
|
32
|
-
expect(
|
|
33
|
-
expect(
|
|
32
|
+
expect(Crawl_HTML).toMatch(/function\s+setAuthorityMode\s*\(mode,\s*btn\)/);
|
|
33
|
+
expect(Crawl_HTML).toContain('n.authority = mode === \'pagerank\' ? n.pageRankAuthority : n.structuralAuthority');
|
|
34
34
|
});
|
|
35
35
|
|
|
36
36
|
it('should contain logic to calculate pageRankAuthority from pageRankScore', () => {
|
|
37
|
-
expect(
|
|
38
|
-
expect(
|
|
37
|
+
expect(Crawl_HTML).toContain('n.pageRankAuthority = n.pageRankScore / 100');
|
|
38
|
+
expect(Crawl_HTML).toContain('n.structuralAuthority = Math.log(1 + n.inLinks)');
|
|
39
39
|
});
|
|
40
40
|
|
|
41
41
|
it('should update details panel to show both metrics', () => {
|
|
42
|
-
expect(
|
|
43
|
-
expect(
|
|
44
|
-
expect(
|
|
42
|
+
expect(Crawl_HTML).toContain('id="d-auth-container"');
|
|
43
|
+
expect(Crawl_HTML).toContain('In-Degree: ${structVal}');
|
|
44
|
+
expect(Crawl_HTML).toContain('PR: <strong>${prVal}</strong>');
|
|
45
45
|
});
|
|
46
46
|
});
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export declare const SITEGRAPH_HTML = "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n <title>Crawlith Site Graph</title>\n <style>\n :root {\n --bg-color: #121212;\n --text-color: #e0e0e0;\n --panel-bg: #1e1e1e;\n --border-color: #333;\n --accent-color: #4a90e2;\n --sidebar-width: 300px;\n }\n body { margin: 0; font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, Helvetica, Arial, sans-serif; background: var(--bg-color); color: var(--text-color); height: 100vh; display: flex; flex-direction: column; overflow: hidden; }\n\n /* Layout */\n header { padding: 0 20px; background: var(--panel-bg); border-bottom: 1px solid var(--border-color); display: flex; justify-content: space-between; align-items: center; height: 60px; box-sizing: border-box; z-index: 10; }\n main { flex: 1; display: flex; overflow: hidden; position: relative; }\n #graph-container { flex: 1; position: relative; overflow: hidden; background: var(--bg-color); }\n #details-panel { width: var(--sidebar-width); background: var(--panel-bg); border-left: 1px solid var(--border-color); padding: 20px; overflow-y: auto; box-sizing: border-box; display: none; flex-direction: column; gap: 15px; }\n #details-panel.visible { display: flex; }\n footer { padding: 5px 20px; background: var(--panel-bg); border-top: 1px solid var(--border-color); font-size: 0.8rem; text-align: center; color: #666; height: 30px; display: flex; align-items: center; justify-content: center; }\n\n /* Header Components */\n .brand { font-weight: bold; font-size: 1.2rem; display: flex; align-items: center; gap: 10px; }\n .brand span { color: var(--accent-color); }\n #metrics-summary { font-size: 0.9rem; color: #aaa; display: flex; gap: 20px; }\n .metric { display: flex; flex-direction: column; align-items: center; line-height: 1.1; }\n .metric-value { font-weight: bold; color: var(--text-color); }\n .metric-label { font-size: 0.7rem; }\n\n #controls { display: flex; gap: 10px; align-items: center; }\n .btn-group { display: flex; background: #333; border-radius: 4px; overflow: hidden; }\n button { background: transparent; color: #aaa; border: none; padding: 6px 12px; cursor: pointer; font-size: 0.85rem; transition: all 0.2s; }\n button:hover { color: white; background: rgba(255,255,255,0.1); }\n button.active { background: var(--accent-color); color: white; }\n\n /* Search */\n #search-container { position: absolute; top: 15px; left: 15px; z-index: 5; }\n #search-input { background: rgba(30,30,30,0.9); border: 1px solid #444; color: white; padding: 8px 12px; border-radius: 20px; width: 200px; outline: none; transition: width 0.3s; }\n #search-input:focus { width: 280px; border-color: var(--accent-color); }\n\n /* Graph */\n svg { width: 100%; height: 100%; display: block; }\n .node { cursor: pointer; transition: stroke-width 0.1s; }\n .link { stroke: #555; stroke-opacity: 0.3; fill: none; pointer-events: none; }\n\n /* Interaction States */\n .node.highlight { stroke: #fff; stroke-width: 2px; }\n .link.highlight { stroke-opacity: 0.8; stroke: #999; }\n .node.faded { opacity: 0.1; }\n .link.faded { opacity: 0.05; }\n\n /* Details Panel Content */\n .detail-section { border-bottom: 1px solid #333; padding-bottom: 10px; }\n .detail-section:last-child { border-bottom: none; }\n .detail-label { font-size: 0.75rem; color: #888; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 4px; }\n .detail-value { font-size: 0.95rem; word-break: break-all; }\n .detail-list { list-style: none; padding: 0; margin: 0; max-height: 150px; overflow-y: auto; font-size: 0.85rem; }\n .detail-list li { padding: 4px 0; border-bottom: 1px solid #2a2a2a; }\n .detail-list a { color: var(--accent-color); text-decoration: none; }\n .detail-list a:hover { text-decoration: underline; }\n\n .status-badge { display: inline-block; padding: 2px 6px; border-radius: 3px; font-size: 0.75rem; font-weight: bold; margin-top: 5px; }\n .status-ok { background: #2e7d32; color: white; }\n .status-warn { background: #f9a825; color: black; }\n .status-error { background: #c62828; color: white; }\n\n /* Tooltip */\n #tooltip { position: absolute; background: rgba(20,20,20,0.95); color: white; padding: 10px; border-radius: 6px; pointer-events: none; font-size: 12px; z-index: 100; box-shadow: 0 4px 15px rgba(0,0,0,0.5); border: 1px solid #444; display: none; transform: translate(-50%, -100%); margin-top: -10px; white-space: nowrap; }\n\n /* Responsive Sidebar */\n @media (max-width: 768px) {\n #details-panel { position: absolute; right: 0; top: 0; bottom: 0; z-index: 20; box-shadow: -5px 0 15px rgba(0,0,0,0.5); transform: translateX(100%); transition: transform 0.3s ease; }\n #details-panel.visible { transform: translateX(0); }\n #metrics-summary { display: none; }\n }\n </style>\n</head>\n<body>\n <header>\n <div class=\"brand\"><span>Crawlith</span> SiteGraph</div>\n\n <div id=\"metrics-summary\">\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-pages\">-</span><span class=\"metric-label\">Pages</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-depth\">-</span><span class=\"metric-label\">Max Depth</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-eff\">-</span><span class=\"metric-label\">Efficiency</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-orphan\">-</span><span class=\"metric-label\">Orphans</span></div>\n </div>\n\n <div id=\"controls\">\n <div class=\"btn-group\" style=\"margin-right: 15px;\">\n <button id=\"btn-auth-pagerank\" class=\"active\" title=\"PageRank Authority\">PageRank</button>\n <button id=\"btn-auth-structural\" title=\"Structural Authority (In-Degree)\">In-Degree</button>\n </div>\n <div class=\"btn-group\">\n <button id=\"btn-hierarchical\" class=\"active\">Hierarchical</button>\n <button id=\"btn-radial\">Radial</button>\n </div>\n </div>\n </header>\n\n <main>\n <div id=\"graph-container\">\n <div id=\"search-container\">\n <input type=\"text\" id=\"search-input\" placeholder=\"Search URL...\">\n </div>\n <svg id=\"graph\"></svg>\n <div id=\"tooltip\"></div>\n </div>\n\n <aside id=\"details-panel\">\n <div class=\"detail-section\">\n <div class=\"detail-label\">URL</div>\n <div class=\"detail-value\" id=\"d-url\">-</div>\n <div id=\"d-status\"></div>\n </div>\n <div class=\"detail-section\" style=\"display: flex; gap: 20px;\">\n <div>\n <div class=\"detail-label\">Depth</div>\n <div class=\"detail-value\" id=\"d-depth\">-</div>\n </div>\n <div>\n <div class=\"detail-label\">Authority</div>\n <div class=\"detail-value\" id=\"d-auth-container\">-</div>\n </div>\n </div>\n <div class=\"detail-section\">\n <div class=\"detail-label\">In-links (<span id=\"d-in-count\">0</span>)</div>\n <!-- List could be populated here if we had the reverse index, for now just count -->\n </div>\n <div class=\"detail-section\">\n <div class=\"detail-label\">Out-links (<span id=\"d-out-count\">0</span>)</div>\n <ul class=\"detail-list\" id=\"d-out-list\"></ul>\n </div>\n </aside>\n </main>\n\n <footer>\n Generated by Crawlith Crawler\n </footer>\n\n <!-- D3 from CDN -->\n <script src=\"https://d3js.org/d3.v7.min.js\"></script>\n\n <script>\n // --- State ---\n const state = {\n nodes: [],\n links: [],\n metrics: {},\n adjacency: new Map(), // url -> { in: [], out: [] }\n simulation: null,\n width: 0,\n height: 0,\n transform: d3.zoomIdentity,\n activeNode: null,\n mode: 'hierarchical', // 'hierarchical' | 'radial'\n maxDepth: 0,\n maxInLinks: 0,\n nodeSelection: null,\n linkSelection: null,\n zoom: null\n };\n\n // --- DOM Elements ---\n const svg = d3.select(\"#graph\");\n const container = svg.append(\"g\");\n const linkGroup = container.append(\"g\").attr(\"class\", \"links\");\n const nodeGroup = container.append(\"g\").attr(\"class\", \"nodes\");\n const tooltip = d3.select(\"#tooltip\");\n const detailsPanel = d3.select(\"#details-panel\");\n\n // --- Initialization ---\n // --- Initialization ---\n async function init() {\n try {\n let graphData, metricsData;\n\n // 1. Try to use injected data (for file:// usage)\n // @ts-ignore\n if (window.GRAPH_DATA) graphData = window.GRAPH_DATA;\n // @ts-ignore\n if (window.METRICS_DATA) metricsData = window.METRICS_DATA;\n\n // 2. Fallback to fetching JSON files (for web server usage)\n if (!graphData || !metricsData) {\n try {\n const [graphRes, metricsRes] = await Promise.all([\n fetch('graph.json'),\n fetch('metrics.json')\n ]);\n if (graphRes.ok && metricsRes.ok) {\n graphData = await graphRes.json();\n metricsData = await metricsRes.json();\n }\n } catch (e) {\n console.warn(\"Fetch failed, possibly due to CORS or missing files.\", e);\n }\n }\n\n if (!graphData || !metricsData) {\n throw new Error(\"No data available. Ensure graph.json exists or data is injected.\");\n }\n\n state.metrics = metricsData;\n processData(graphData);\n updateMetricsUI();\n\n // Setup UI\n setupResize();\n setupInteractions();\n setupSearch();\n\n // Start Simulation\n initSimulation();\n\n } catch (err) {\n console.error(err);\n alert(\"Error loading visualization data: \" + err.message);\n }\n }\n\n function processData(data) {\n // Create a map for fast lookup\n const nodeMap = new Map();\n\n data.nodes.forEach(n => {\n n.inLinks = n.inLinks || 0;\n n.outLinks = n.outLinks || 0;\n nodeMap.set(n.url, n);\n });\n\n // Filter valid links\n state.links = data.edges\n .map(e => ({ source: nodeMap.get(e.source), target: nodeMap.get(e.target) }))\n .filter(e => e.source && e.target);\n\n state.nodes = data.nodes;\n\n // Calculate Stats\n state.maxDepth = d3.max(state.nodes, d => d.depth) || 1;\n state.maxInLinks = d3.max(state.nodes, d => d.inLinks) || 1;\n\n // Calculate Authority & Enrich Nodes\n state.nodes.forEach(n => {\n // Structural Authority: log-scaled normalized 0-1 based on in-links\n n.structuralAuthority = Math.log(1 + n.inLinks) / Math.log(1 + state.maxInLinks);\n\n // PageRank Authority: normalized 0-1 from pageRankScore (0-100)\n if (typeof n.pageRankScore === 'number') {\n n.pageRankAuthority = n.pageRankScore / 100;\n } else {\n n.pageRankAuthority = n.structuralAuthority;\n }\n\n // Default authority to PageRank if available, else structural\n n.authority = n.pageRankAuthority;\n\n // Ensure x,y are initialized to avoid NaNs if D3 doesn't do it fast enough\n n.x = 0; n.y = 0;\n });\n\n // Build Adjacency Map\n state.nodes.forEach(n => state.adjacency.set(n.url, { in: [], out: [] }));\n state.links.forEach(l => {\n state.adjacency.get(l.source.url).out.push(l.target);\n state.adjacency.get(l.target.url).in.push(l.source);\n });\n }\n\n function updateMetricsUI() {\n document.getElementById('m-pages').textContent = state.metrics.totalPages;\n document.getElementById('m-depth').textContent = state.metrics.maxDepthFound;\n document.getElementById('m-eff').textContent = (state.metrics.crawlEfficiencyScore * 100).toFixed(1) + '%';\n document.getElementById('m-orphan').textContent = state.metrics.orphanPages.length;\n }\n\n // --- Simulation ---\n function initSimulation() {\n const { width, height } = getDimensions();\n state.width = width;\n state.height = height;\n\n // Safeguards\n const nodeCount = state.nodes.length;\n const enableCollision = nodeCount <= 1200;\n const alphaDecay = nodeCount > 1000 ? 0.05 : 0.02; // Faster decay for large graphs\n\n state.simulation = d3.forceSimulation(state.nodes)\n .alphaDecay(alphaDecay)\n .force(\"link\", d3.forceLink(state.links).id(d => d.url).strength(0.5)) // Reduced strength for flexibility\n .force(\"charge\", d3.forceManyBody().strength(nodeCount > 1000 ? -100 : -300))\n .force(\"center\", d3.forceCenter(width / 2, height / 2));\n\n if (enableCollision) {\n state.simulation.force(\"collide\", d3.forceCollide().radius(d => getNodeRadius(d) + 2).iterations(1));\n }\n\n // Apply Layout Mode\n applyLayoutMode(state.mode);\n\n // Rendering loop\n state.simulation.on(\"tick\", ticked);\n\n // Render initial SVG elements\n render();\n }\n\n function applyLayoutMode(mode) {\n state.mode = mode;\n const { width, height } = state;\n const centerY = height / 2;\n const centerX = width / 2;\n\n // Remove conflicting forces\n state.simulation.force(\"y\", null);\n state.simulation.force(\"radial\", null);\n\n if (mode === 'hierarchical') {\n const depthSpacing = height / (state.maxDepth + 2);\n // Hierarchical: Nodes pushed to Y levels based on depth\n state.simulation.force(\"y\", d3.forceY(d => {\n return (d.depth * depthSpacing) - (height/2) + 50; // Offset to start from top\n }).strength(1));\n // We rely on \"center\" force to keep X centered, but maybe add weak forceX?\n // Let's add weak forceX to prevent wide spread\n state.simulation.force(\"x\", d3.forceX(0).strength(0.05));\n state.simulation.force(\"center\", d3.forceCenter(width/2, height/2)); // Recenter\n\n } else if (mode === 'radial') {\n const maxRadius = Math.min(width, height) / 2 - 50;\n const ringSpacing = maxRadius / (state.maxDepth + 1);\n\n state.simulation.force(\"radial\", d3.forceRadial(\n d => d.depth * ringSpacing,\n width / 2,\n height / 2\n ).strength(0.8));\n\n state.simulation.force(\"x\", null); // Remove X constraint\n }\n\n state.simulation.alpha(1).restart();\n }\n\n function getNodeRadius(d) {\n // 5 + authority * 15\n return 5 + (d.authority * 15);\n }\n\n function getNodeColor(d) {\n // Depth-based sequential color (Blue -> Purple -> Pink)\n const t = d.depth / (state.maxDepth || 1);\n return d3.interpolateViridis(1 - t); // Invert Viridis for better contrast on dark\n }\n\n function render() {\n // Links\n state.linkSelection = linkGroup.selectAll(\"line\")\n .data(state.links)\n .join(\"line\")\n .attr(\"class\", \"link\")\n .attr(\"stroke-width\", 0.5);\n\n // Nodes\n state.nodeSelection = nodeGroup.selectAll(\"circle\")\n .data(state.nodes)\n .join(\"circle\")\n .attr(\"class\", \"node\")\n .attr(\"r\", d => getNodeRadius(d))\n .attr(\"fill\", d => getNodeColor(d))\n .attr(\"stroke\", d => d.status >= 400 ? \"#ff4444\" : null) // Red stroke for errors\n .on(\"mouseover\", (event, d) => {\n if (state.activeNode) return;\n highlightNode(d);\n showTooltip(event, d);\n })\n .on(\"mouseout\", () => {\n if (state.activeNode) return;\n resetHighlight();\n hideTooltip();\n })\n .on(\"click\", (event, d) => {\n event.stopPropagation();\n selectNode(d);\n })\n .call(d3.drag()\n .on(\"start\", dragstarted)\n .on(\"drag\", dragged)\n .on(\"end\", dragended));\n\n // Zoom\n state.zoom = d3.zoom()\n .scaleExtent([0.1, 4])\n .on(\"zoom\", (event) => {\n state.transform = event.transform;\n container.attr(\"transform\", event.transform);\n });\n\n svg.call(state.zoom)\n .call(state.zoom.transform, d3.zoomIdentity.translate(state.width/2, state.height/2).scale(0.8).translate(-state.width/2, -state.height/2)); // Initial zoom out\n }\n\n function ticked() {\n if (state.linkSelection) {\n state.linkSelection\n .attr(\"x1\", d => d.source.x)\n .attr(\"y1\", d => d.source.y)\n .attr(\"x2\", d => d.target.x)\n .attr(\"y2\", d => d.target.y);\n }\n\n if (state.nodeSelection) {\n state.nodeSelection\n .attr(\"cx\", d => d.x)\n .attr(\"cy\", d => d.y);\n }\n }\n\n // --- Interactions ---\n\n function setupInteractions() {\n // Background click to clear selection\n svg.on(\"click\", () => {\n state.activeNode = null;\n resetHighlight();\n detailsPanel.classed(\"visible\", false);\n });\n\n // Layout Toggle\n d3.select(\"#btn-hierarchical\").on(\"click\", function() {\n setMode('hierarchical', this);\n });\n d3.select(\"#btn-radial\").on(\"click\", function() {\n setMode('radial', this);\n });\n\n // Authority Toggle\n d3.select(\"#btn-auth-pagerank\").on(\"click\", function() {\n setAuthorityMode('pagerank', this);\n });\n d3.select(\"#btn-auth-structural\").on(\"click\", function() {\n setAuthorityMode('structural', this);\n });\n }\n\n function setAuthorityMode(mode, btn) {\n d3.select(\"#btn-auth-pagerank\").classed(\"active\", false);\n d3.select(\"#btn-auth-structural\").classed(\"active\", false);\n d3.select(btn).classed(\"active\", true);\n\n state.nodes.forEach(n => {\n n.authority = mode === 'pagerank' ? n.pageRankAuthority : n.structuralAuthority;\n });\n\n // Update Visuals\n nodeGroup.selectAll(\"circle\")\n .transition().duration(500)\n .attr(\"r\", d => getNodeRadius(d));\n\n // Update collision force if enabled\n if (state.simulation.force(\"collide\")) {\n state.simulation.force(\"collide\", d3.forceCollide().radius(d => getNodeRadius(d) + 2).iterations(1));\n state.simulation.alpha(0.3).restart();\n }\n }\n\n function setMode(mode, btn) {\n d3.selectAll(\"#controls button\").classed(\"active\", false);\n d3.select(btn).classed(\"active\", true);\n applyLayoutMode(mode);\n }\n\n function highlightNode(d) {\n const neighbors = new Set();\n const adj = state.adjacency.get(d.url);\n if (adj) {\n adj.in.forEach(n => neighbors.add(n.url));\n adj.out.forEach(n => neighbors.add(n.url));\n }\n neighbors.add(d.url);\n\n nodeGroup.selectAll(\"circle\").classed(\"faded\", n => !neighbors.has(n.url));\n nodeGroup.selectAll(\"circle\").classed(\"highlight\", n => n.url === d.url);\n\n linkGroup.selectAll(\"line\").classed(\"faded\", l =>\n l.source.url !== d.url && l.target.url !== d.url\n );\n linkGroup.selectAll(\"line\").classed(\"highlight\", l =>\n l.source.url === d.url || l.target.url === d.url\n );\n }\n\n function resetHighlight() {\n nodeGroup.selectAll(\"circle\").classed(\"faded\", false).classed(\"highlight\", false);\n linkGroup.selectAll(\"line\").classed(\"faded\", false).classed(\"highlight\", false);\n }\n\n function selectNode(d) {\n state.activeNode = d;\n highlightNode(d);\n showDetails(d);\n }\n\n function showTooltip(event, d) {\n // If we are transforming the container, we need to map coordinates correctly or just use pageX/Y\n tooltip.style(\"display\", \"block\")\n .html(`<strong>${new URL(d.url).pathname}</strong><br>Auth: ${(d.authority * 10).toFixed(1)}`)\n .style(\"left\", (event.pageX) + \"px\")\n .style(\"top\", (event.pageY - 10) + \"px\");\n }\n\n function hideTooltip() {\n tooltip.style(\"display\", \"none\");\n }\n\n function showDetails(d) {\n detailsPanel.classed(\"visible\", true);\n d3.select(\"#d-url\").text(d.url);\n d3.select(\"#d-depth\").text(d.depth);\n\n const authContainer = d3.select(\"#d-auth-container\");\n authContainer.html(\"\");\n const prVal = (d.pageRankAuthority * 100).toFixed(1);\n const structVal = d.structuralAuthority.toFixed(3);\n authContainer.append(\"div\").html(`PR: <strong>${prVal}</strong>`);\n authContainer.append(\"div\").style(\"color\", \"#888\").style(\"font-size\", \"0.8em\").text(`In-Degree: ${structVal}`);\n\n d3.select(\"#d-in-count\").text(d.inLinks);\n d3.select(\"#d-out-count\").text(d.outLinks);\n\n // Status badge\n const statusDiv = d3.select(\"#d-status\");\n statusDiv.html(\"\");\n let sClass = \"status-ok\";\n if (d.status >= 400) sClass = \"status-error\";\n else if (d.status >= 300) sClass = \"status-warn\";\n statusDiv.append(\"span\").attr(\"class\", \"status-badge \" + sClass).text(d.status);\n\n // Outlinks list (limit to 20)\n const list = d3.select(\"#d-out-list\");\n list.html(\"\");\n const adj = state.adjacency.get(d.url);\n if (adj && adj.out.length > 0) {\n adj.out.slice(0, 50).forEach(target => {\n list.append(\"li\").append(\"a\")\n .attr(\"href\", target.url)\n .attr(\"target\", \"_blank\")\n .text(new URL(target.url).pathname);\n });\n if (adj.out.length > 50) {\n list.append(\"li\").text(`...and ${adj.out.length - 50} more`);\n }\n } else {\n list.append(\"li\").text(\"No outgoing links\");\n }\n }\n\n // --- Search ---\n function setupSearch() {\n const input = document.getElementById('search-input');\n input.addEventListener('keydown', (e) => {\n if (e.key === 'Enter') {\n const val = input.value.trim().toLowerCase();\n if (!val) return;\n\n const found = state.nodes.find(n => n.url.toLowerCase().includes(val));\n if (found) {\n selectNode(found);\n // Center view on node\n const transform = d3.zoomIdentity\n .translate(state.width/2, state.height/2)\n .scale(2)\n .translate(-found.x, -found.y);\n\n svg.transition().duration(750).call(state.zoom.transform, transform);\n }\n }\n });\n }\n\n function setupResize() {\n window.addEventListener(\"resize\", () => {\n const { width, height } = getDimensions();\n state.width = width;\n state.height = height;\n state.simulation.force(\"center\", d3.forceCenter(width / 2, height / 2));\n if (state.mode === 'hierarchical') {\n // Re-evaluate Y force if needed, but usually center is enough\n }\n state.simulation.alpha(0.3).restart();\n });\n }\n\n function getDimensions() {\n const rect = document.getElementById(\"graph-container\").getBoundingClientRect();\n return { width: rect.width, height: rect.height };\n }\n\n // --- Dragging ---\n function dragstarted(event, d) {\n if (!event.active) state.simulation.alphaTarget(0.3).restart();\n d.fx = d.x;\n d.fy = d.y;\n }\n\n function dragged(event, d) {\n d.fx = event.x;\n d.fy = event.y;\n }\n\n function dragended(event, d) {\n if (!event.active) state.simulation.alphaTarget(0);\n d.fx = null;\n d.fy = null;\n }\n\n // Start\n if (document.readyState === 'loading') {\n document.addEventListener('DOMContentLoaded', init);\n } else {\n init();\n }\n </script>\n</body>\n</html>\n";
|