@crawlith/core 0.1.0 → 0.1.1
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/scoring/orphanSeverity.ts

```diff
@@ -1,7 +1,7 @@
 export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
 export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
 
-export interface SitegraphNode {
+export interface CrawlNode {
   url: string;
   depth: number;
   inLinks: number;
@@ -19,7 +19,7 @@ export interface SitegraphNode {
   isProductOrCommercial?: boolean;
 }
 
-export interface SitegraphEdge {
+export interface CrawlEdge {
   source: string;
   target: string;
 }
@@ -32,7 +32,7 @@ export interface OrphanScoringOptions {
   rootUrl?: string;
 }
 
-export type AnnotatedNode = SitegraphNode & {
+export type AnnotatedNode = CrawlNode & {
   orphan: boolean;
   orphanType?: OrphanType;
   orphanSeverity?: number;
@@ -46,7 +46,7 @@ const LOW_VALUE_PATTERNS = [
   /\/search(\/|\?|$)/i
 ];
 
-function isLowValuePage(node: SitegraphNode): boolean {
+function isLowValuePage(node: CrawlNode): boolean {
   const type = (node.pageType || '').toLowerCase();
   if (['pagination', 'tag', 'category', 'filter', 'search', 'archive'].includes(type)) {
     return true;
@@ -68,7 +68,7 @@ export function mapImpactLevel(score: number): ImpactLevel {
   return 'critical';
 }
 
-export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
+export function calculateOrphanSeverity(orphanType: OrphanType, node: CrawlNode): number {
   let score = 0;
 
   switch (orphanType) {
@@ -106,7 +106,7 @@ export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
   return clampScore(score);
 }
 
-function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
+function consolidateInboundByCanonical(nodes: CrawlNode[]): Map<string, number> {
   const canonicalInbound = new Map<string, number>();
   for (const node of nodes) {
     const canonical = node.canonicalUrl || node.url;
@@ -115,7 +115,7 @@ function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
   return canonicalInbound;
 }
 
-export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
+export function annotateOrphans(nodes: CrawlNode[], edges: CrawlEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
   if (!options.enabled) {
     return nodes.map((node) => ({ ...node, orphan: false }));
   }
@@ -144,7 +144,7 @@ export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
   const inboundSources = edges
     .filter((edge) => edge.target === node.url)
     .map((edge) => nodeByUrl.get(edge.source))
-    .filter((source): source is SitegraphNode => Boolean(source));
+    .filter((source): source is CrawlNode => Boolean(source));
 
   if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
     orphanType = 'soft';
```
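The rename above is mechanical: the orphan-scoring functions keep their shapes, only `SitegraphNode`/`SitegraphEdge` become `CrawlNode`/`CrawlEdge`. Below is a minimal sketch of calling `annotateOrphans` against the signatures shown in this hunk; the `@crawlith/core` root import, the options object, and the abridged node fields are assumptions (the full `CrawlNode` interface is not visible in this diff).

```ts
// Sketch only: assumes annotateOrphans, CrawlNode, and CrawlEdge are exported
// from the package root; node objects are abridged to the fields visible above.
import { annotateOrphans, type CrawlNode, type CrawlEdge } from '@crawlith/core';

const nodes = [
  { url: 'https://example.com/', depth: 0, inLinks: 0 },
  { url: 'https://example.com/about', depth: 1, inLinks: 1 },
  { url: 'https://example.com/old-post', depth: 3, inLinks: 0 },
] as CrawlNode[]; // cast because the real interface has more fields than shown here

const edges: CrawlEdge[] = [
  { source: 'https://example.com/', target: 'https://example.com/about' },
];

// Returns AnnotatedNode[] with orphan / orphanType / orphanSeverity added per node.
const annotated = annotateOrphans(nodes, edges, { enabled: true, rootUrl: 'https://example.com/' });
console.log(annotated.filter((n) => n.orphan).map((n) => `${n.url} (${n.orphanType})`));
```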
package/tests/analysis.unit.test.ts

```diff
@@ -76,6 +76,33 @@ describe('structured data', () => {
     const missing = analyzeStructuredData('<p>none</p>');
     expect(missing.present).toBe(false);
   });
+
+  test('handles array of types', () => {
+    const html = '<script type="application/ld+json">{"@type": ["Article", "NewsArticle"]}</script>';
+    const result = analyzeStructuredData(html);
+    expect(result.types).toContain('Article');
+    expect(result.types).toContain('NewsArticle');
+  });
+
+  test('handles @graph structure', () => {
+    const html = '<script type="application/ld+json">{"@graph": [{"@type": "Person"}, {"@type": "Organization"}]}</script>';
+    const result = analyzeStructuredData(html);
+    expect(result.types).toContain('Person');
+    expect(result.types).toContain('Organization');
+  });
+
+  test('handles top-level array', () => {
+    const html = '<script type="application/ld+json">[{"@type": "A"}, {"@type": "B"}]</script>';
+    const result = analyzeStructuredData(html);
+    expect(result.types).toContain('A');
+    expect(result.types).toContain('B');
+  });
+
+  test('handles empty script content', () => {
+    const html = '<script type="application/ld+json"> </script>';
+    const result = analyzeStructuredData(html);
+    expect(result.valid).toBe(false);
+  });
 });
 
 describe('links and images', () => {
@@ -88,6 +115,15 @@ describe('links and images', () => {
     expect(links.externalRatio).toBeCloseTo(2 / 3);
   });
 
+  test('link ratio with no links', () => {
+    const html = '<div><p>No links here</p></div>';
+    const links = analyzeLinks(html, 'https://example.com/page', 'https://example.com');
+    expect(links.internalLinks).toBe(0);
+    expect(links.externalLinks).toBe(0);
+    expect(links.nofollowCount).toBe(0);
+    expect(links.externalRatio).toBe(0);
+  });
+
   test('image alt detection', () => {
     const html = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
     const imgs = analyzeImageAlts(html);
@@ -95,4 +131,12 @@ describe('links and images', () => {
     expect(imgs.missingAlt).toBe(1);
     expect(imgs.emptyAlt).toBe(1);
   });
+
+  test('image alt detection no images', () => {
+    const html = '<div><p>No images here</p></div>';
+    const imgs = analyzeImageAlts(html);
+    expect(imgs.totalImages).toBe(0);
+    expect(imgs.missingAlt).toBe(0);
+    expect(imgs.emptyAlt).toBe(0);
+  });
 });
```
package/tests/analyze.integration.test.ts

```diff
@@ -1,13 +1,75 @@
-import { describe, expect, test } from 'vitest';
+import { describe, expect, test, afterEach, vi } from 'vitest';
 import path from 'node:path';
 import fs from 'node:fs/promises';
 import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
+import { getDb, closeDb } from '../src/db/index.js';
+import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
+import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
+import { PageRepository } from '../src/db/repositories/PageRepository.js';
+import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
+import { EngineContext } from '../src/events.js';
+
+const mockContext: EngineContext = { emit: vi.fn() };
 
 describe('analyze integration', () => {
   const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');
 
+  async function setupTestDb(rawData: any) {
+    // Force in-memory DB for this test
+    process.env.CRAWLITH_DB_PATH = ':memory:';
+
+    // Close existing DB connection if any to ensure fresh start
+    closeDb();
+
+    const db = getDb();
+    const siteRepo = new SiteRepository(db);
+    const snapshotRepo = new SnapshotRepository(db);
+    const pageRepo = new PageRepository(db);
+    const edgeRepo = new EdgeRepository(db);
+
+    // Create site and snapshot
+    const domain = 'example.com';
+    const siteId = siteRepo.createSite(domain);
+    const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
+
+    // Parse fixture and load pages into db
+    const pages = rawData.pages || rawData.nodes || [];
+    pages.forEach((p: any) => {
+      pageRepo.upsertPage({
+        site_id: siteId,
+        normalized_url: p.url,
+        last_seen_snapshot_id: snapshotId,
+        http_status: p.status || 200,
+        html: p.html || '',
+        depth: p.depth || 0,
+      });
+    });
+
+    if (rawData.edges) {
+      rawData.edges.forEach((e: any) => {
+        const sourceId = pageRepo.getIdByUrl(siteId, e.source);
+        const targetId = pageRepo.getIdByUrl(siteId, e.target);
+        if (sourceId && targetId) {
+          edgeRepo.insertEdge(snapshotId, sourceId, targetId);
+        }
+      });
+    }
+
+    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: pages.length, edge_count: (rawData.edges || []).length });
+    return { db, siteId, snapshotId };
+  }
+
+  afterEach(() => {
+    closeDb();
+    delete process.env.CRAWLITH_DB_PATH;
+  });
+
   test('analyzes full crawl fixture and schema', async () => {
-    const
+    const rawContent = await fs.readFile(fixturePath, 'utf-8');
+    const rawData = JSON.parse(rawContent);
+    await setupTestDb(rawData);
+
+    const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
 
     expect(result.site_summary.pages_analyzed).toBe(3);
     expect(result.site_summary.duplicate_titles).toBe(2);
@@ -20,79 +82,52 @@ describe('analyze integration', () => {
   });
 
   test('module filter flags behavior', async () => {
-    const
+    const rawContent = await fs.readFile(fixturePath, 'utf-8');
+    const rawData = JSON.parse(rawContent);
+    await setupTestDb(rawData);
+
+    const seoOnly = await analyzeSite('https://example.com', { seo: true }, mockContext);
     expect(seoOnly.pages[0].content.wordCount).toBe(0);
     expect(seoOnly.pages[0].images.totalImages).toBe(0);
 
-    const contentOnly = await analyzeSite('https://example.com', {
+    const contentOnly = await analyzeSite('https://example.com', { content: true }, mockContext);
     expect(contentOnly.pages[0].title.status).toBe('missing');
     expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);
 
-    const accessibilityOnly = await analyzeSite('https://example.com', {
+    const accessibilityOnly = await analyzeSite('https://example.com', { accessibility: true }, mockContext);
     expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
     expect(accessibilityOnly.pages[0].title.status).toBe('missing');
   });
 
   test('html report generation', async () => {
-    const
+    const rawContent = await fs.readFile(fixturePath, 'utf-8');
+    const rawData = JSON.parse(rawContent);
+    await setupTestDb(rawData);
+
+    const result = await analyzeSite('https://example.com', {}, mockContext);
     const html = renderAnalysisHtml(result);
     expect(html).toContain('<table');
     expect(html).toContain('Analysis');
   });
 
   test('default database loading', async () => {
-    //
-
-
-
-
-    closeDb();
-
-    // Setup repositories
-    const { SiteRepository } = await import('../src/db/repositories/SiteRepository.js');
-    const { SnapshotRepository } = await import('../src/db/repositories/SnapshotRepository.js');
-    const { PageRepository } = await import('../src/db/repositories/PageRepository.js');
-
-    const db = getDb();
-    const siteRepo = new SiteRepository(db);
-    const snapshotRepo = new SnapshotRepository(db);
-    const pageRepo = new PageRepository(db);
-
-    // Create site and snapshot
-    const siteId = siteRepo.createSite('example.com');
-    const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
+    // This is essentially same as 'analyzes full crawl fixture' but was explicit before.
+    // We can keep it to verify manual DB setup works as expected (which setupTestDb does).
+    const rawContent = await fs.readFile(fixturePath, 'utf-8');
+    const rawData = JSON.parse(rawContent);
+    await setupTestDb(rawData);
 
-
-
-    const rawData = JSON.parse(rawYaml);
-    (rawData.pages || rawData.nodes).forEach((p: any) => {
-      pageRepo.upsertPage({
-        site_id: siteId,
-        normalized_url: p.url,
-        last_seen_snapshot_id: snapshotId,
-        http_status: p.status || 200,
-        html: p.html || '',
-        depth: p.depth || 0,
-      });
-    });
-
-    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 3, edge_count: 0 });
-
-    try {
-      const result = await analyzeSite('https://example.com', {});
-      expect(result.site_summary.pages_analyzed).toBe(3);
-    } finally {
-      closeDb();
-      delete process.env.CRAWLITH_DB_PATH;
-    }
+    const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
+    expect(result.site_summary.pages_analyzed).toBe(3);
   });
 
   test('handles large html and js-only content', async () => {
     const hugeText = '<html><body><script>document.write("x")</script>' + '<p>word </p>'.repeat(1000) + '</body></html>';
-    const
-
-
+    const data = { pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] };
+
+    await setupTestDb(data);
+
+    const result = await analyzeSite('https://example.com', {}, mockContext);
     expect(result.pages[0].content.wordCount).toBe(1000);
-    await fs.unlink(tmpFile);
   });
 });
```
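The rewritten integration test also documents the new data path: `analyzeSite` now reads pages out of the SQLite-backed repositories rather than a crawl JSON file, the database location is taken from `CRAWLITH_DB_PATH`, and an `EngineContext` with an `emit` callback is passed as the third argument. Below is a rough sketch of the same seeding flow outside of vitest, using only calls that appear in `setupTestDb` above; the deep `dist/` import specifiers are assumptions about how the built package is laid out.

```ts
// Sketch of the seeding flow used by setupTestDb(); import paths are illustrative.
import { getDb, closeDb } from '@crawlith/core/dist/db/index.js';
import { SiteRepository } from '@crawlith/core/dist/db/repositories/SiteRepository.js';
import { SnapshotRepository } from '@crawlith/core/dist/db/repositories/SnapshotRepository.js';
import { PageRepository } from '@crawlith/core/dist/db/repositories/PageRepository.js';

process.env.CRAWLITH_DB_PATH = ':memory:'; // keep the database off disk, as the test does

const db = getDb();
const siteId = new SiteRepository(db).createSite('example.com');
const snapshotRepo = new SnapshotRepository(db);
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');

new PageRepository(db).upsertPage({
  site_id: siteId,
  normalized_url: 'https://example.com/',
  last_seen_snapshot_id: snapshotId,
  http_status: 200,
  html: '<html><body><h1>Home</h1></body></html>',
  depth: 0,
});

snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 1, edge_count: 0 });
closeDb();
```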
package/tests/analyze_markdown.test.ts (new file)

```diff
@@ -0,0 +1,98 @@
+import { describe, expect, test } from 'vitest';
+import { renderAnalysisMarkdown, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
+
+describe('renderAnalysisMarkdown', () => {
+  const mockPage: PageAnalysis = {
+    url: 'https://example.com/page1',
+    status: 200,
+    title: { value: 'Page 1', length: 6, status: 'ok' },
+    metaDescription: { value: 'Desc 1', length: 6, status: 'ok' },
+    h1: { count: 1, status: 'ok', matchesTitle: true },
+    content: { wordCount: 100, textHtmlRatio: 0.5, uniqueSentenceCount: 10 },
+    thinScore: 0,
+    images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
+    links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
+    structuredData: { present: true, valid: true, types: ['Article'] },
+    seoScore: 90,
+    meta: {}
+  };
+
+  const mockResult: AnalysisResult = {
+    site_summary: {
+      pages_analyzed: 2,
+      avg_seo_score: 85,
+      thin_pages: 0,
+      duplicate_titles: 0,
+      site_score: 88,
+    },
+    site_scores: {
+      overallScore: 88,
+      seoHealthScore: 85,
+    } as any, // casting to any to avoid mocking full return type of aggregateSiteScore if complex
+    pages: [
+      mockPage,
+      {
+        ...mockPage,
+        url: 'https://example.com/page2',
+        seoScore: 80,
+        thinScore: 10,
+        title: { value: 'Page 2', length: 6, status: 'duplicate' },
+        metaDescription: { value: 'Desc 2', length: 6, status: 'missing' },
+      }
+    ],
+    active_modules: {
+      seo: true,
+      content: true,
+      accessibility: true,
+    },
+  };
+
+  test('renders markdown summary correctly', () => {
+    const markdown = renderAnalysisMarkdown(mockResult);
+
+    expect(markdown).toContain('# Crawlith SEO Analysis Report');
+    expect(markdown).toContain('## 📊 Summary');
+    expect(markdown).toContain('- Pages Analyzed: 2');
+    expect(markdown).toContain('- Overall Site Score: 88.0');
+    expect(markdown).toContain('- Avg SEO Score: 85.0');
+    expect(markdown).toContain('- Thin Pages Found: 0');
+    expect(markdown).toContain('- Duplicate Titles: 0');
+  });
+
+  test('renders page details table header', () => {
+    const markdown = renderAnalysisMarkdown(mockResult);
+
+    expect(markdown).toContain('## 📄 Page Details');
+    expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
+    expect(markdown).toContain('| :--- | :--- | :--- | :--- | :--- |');
+  });
+
+  test('renders page rows correctly', () => {
+    const markdown = renderAnalysisMarkdown(mockResult);
+
+    // Check first page row
+    expect(markdown).toContain('| https://example.com/page1 | 90 | 0 | ok | ok |');
+
+    // Check second page row
+    expect(markdown).toContain('| https://example.com/page2 | 80 | 10 | duplicate | missing |');
+  });
+
+  test('handles empty pages list', () => {
+    const emptyResult: AnalysisResult = {
+      ...mockResult,
+      pages: [],
+      site_summary: {
+        ...mockResult.site_summary,
+        pages_analyzed: 0,
+      }
+    };
+
+    const markdown = renderAnalysisMarkdown(emptyResult);
+
+    expect(markdown).toContain('- Pages Analyzed: 0');
+    // Should still contain headers
+    expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
+    // Should not contain any data rows
+    expect(markdown).not.toContain('| https://example.com');
+  });
+});
```
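Judging by these assertions, `renderAnalysisMarkdown` takes the same `AnalysisResult` that `analyzeSite` produces and returns a report string with a summary block and a page-details table. A small hedged sketch of wiring the two together and writing the report to disk follows; treating both functions as package-root exports is an assumption, and the `{ allPages: true }` flag plus the `emit` context come from the integration test above.

```ts
// Sketch only: assumes analyzeSite and renderAnalysisMarkdown are re-exported
// from the package root the same way they are from src/analysis/analyze.js.
import fs from 'node:fs/promises';
import { analyzeSite, renderAnalysisMarkdown } from '@crawlith/core';

const result = await analyzeSite('https://example.com', { allPages: true }, { emit: () => {} });
await fs.writeFile('analysis-report.md', renderAnalysisMarkdown(result), 'utf-8');
```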
package/tests/audit/audit.test.ts (new file)

```diff
@@ -0,0 +1,101 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { auditUrl } from '../../src/audit/index.js';
+import { resolveDns } from '../../src/audit/dns.js';
+import { analyzeTransport } from '../../src/audit/transport.js';
+import { analyzeHeaders } from '../../src/audit/headers.js';
+import { calculateScore } from '../../src/audit/scoring.js';
+import { IPGuard } from '../../src/core/security/ipGuard.js';
+
+// Mock dependencies
+vi.mock('../../src/audit/dns.js', () => ({
+  resolveDns: vi.fn(),
+}));
+vi.mock('../../src/audit/transport.js', () => ({
+  analyzeTransport: vi.fn(),
+}));
+vi.mock('../../src/audit/headers.js', () => ({
+  analyzeHeaders: vi.fn(),
+}));
+vi.mock('../../src/audit/scoring.js', () => ({
+  calculateScore: vi.fn(),
+}));
+vi.mock('../../src/core/security/ipGuard.js', () => ({
+  IPGuard: {
+    validateHost: vi.fn(),
+  },
+}));
+
+describe('auditUrl', () => {
+  const mockUrl = 'https://example.com';
+
+  beforeEach(() => {
+    vi.resetAllMocks();
+  });
+
+  it('should successfully audit a valid URL', async () => {
+    // Setup mocks
+    vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
+
+    const mockDnsResult = { ip: '1.2.3.4' };
+    vi.mocked(resolveDns).mockResolvedValue(mockDnsResult as any);
+
+    const mockTransportResult = {
+      transport: { headers: {} },
+      performance: { loadTime: 100 },
+      issues: [],
+    };
+    vi.mocked(analyzeTransport).mockResolvedValue(mockTransportResult as any);
+
+    const mockHeadersResult = { grade: 'A' };
+    vi.mocked(analyzeHeaders).mockReturnValue(mockHeadersResult as any);
+
+    const mockScoringResult = {
+      score: 95,
+      grade: 'A',
+      issues: [],
+    };
+    vi.mocked(calculateScore).mockReturnValue(mockScoringResult as any);
+
+    // Execute
+    const result = await auditUrl(mockUrl);
+
+    // Verify
+    expect(IPGuard.validateHost).toHaveBeenCalledWith('example.com');
+    expect(resolveDns).toHaveBeenCalledWith('example.com');
+    expect(analyzeTransport).toHaveBeenCalledWith(mockUrl, 10000); // default timeout
+    expect(analyzeHeaders).toHaveBeenCalledWith(mockTransportResult.transport.headers);
+    expect(calculateScore).toHaveBeenCalled();
+
+    expect(result).toEqual({
+      url: mockUrl,
+      transport: mockTransportResult.transport,
+      securityHeaders: mockHeadersResult,
+      dns: mockDnsResult,
+      performance: mockTransportResult.performance,
+      score: mockScoringResult.score,
+      grade: mockScoringResult.grade,
+      issues: mockScoringResult.issues,
+    });
+  });
+
+  it('should throw error for invalid URL protocol', async () => {
+    await expect(auditUrl('ftp://example.com')).rejects.toThrow('Only HTTP and HTTPS protocols are supported');
+  });
+
+  it('should throw error for malformed URL', async () => {
+    await expect(auditUrl('not-a-url')).rejects.toThrow('Invalid URL');
+  });
+
+  it('should throw error if SSRF check fails', async () => {
+    vi.mocked(IPGuard.validateHost).mockResolvedValue(false);
+    await expect(auditUrl(mockUrl)).rejects.toThrow('Access to internal or private infrastructure is prohibited');
+  });
+
+  it('should propagate errors from dependencies', async () => {
+    vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
+    vi.mocked(resolveDns).mockRejectedValue(new Error('DNS Error'));
+    vi.mocked(analyzeTransport).mockResolvedValue({} as any); // Should resolve if DNS fails? Wait, Promise.all fails if any fails.
+
+    await expect(auditUrl(mockUrl)).rejects.toThrow('DNS Error');
+  });
+});
```
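Taken together, the mocks above pin down `auditUrl`'s contract: it rejects non-HTTP(S) schemes, malformed URLs, and SSRF-flagged hosts, and otherwise resolves to `{ url, transport, securityHeaders, dns, performance, score, grade, issues }`. Below is a minimal consumer sketch based on that contract; the package-root export is an assumption.

```ts
// Sketch only: assumes auditUrl is re-exported from the package root.
import { auditUrl } from '@crawlith/core';

try {
  const report = await auditUrl('https://example.com');
  // score/grade/issues come straight from the scoring step, per the expectations above.
  console.log(`${report.url}: grade ${report.grade}, score ${report.score}`);
  for (const issue of report.issues) {
    console.log(`- ${issue.id}`);
  }
} catch (err) {
  // Invalid protocols, malformed URLs, and private/internal targets all reject.
  console.error('Audit failed:', (err as Error).message);
}
```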
package/tests/audit/scoring.test.ts

```diff
@@ -1,6 +1,6 @@
 import { describe, it, expect } from 'vitest';
 import { calculateScore } from '../../src/audit/scoring.js';
-import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics
+import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics } from '../../src/audit/types.js';
 
 describe('Scoring Engine', () => {
   const mockTransport: TransportDiagnostics = {
@@ -84,8 +84,8 @@ describe('Scoring Engine', () => {
 
   it('should fail on expired cert', () => {
     const expiredTransport = {
-
-
+      ...mockTransport,
+      certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' }
     };
     const result = calculateScore(expiredTransport, mockDns, mockHeaders, mockPerformance, []);
     expect(result.grade).toBe('F');
@@ -104,30 +104,30 @@ describe('Scoring Engine', () => {
   });
 
   it('should penalize poor performance', () => {
-
-
-
-
-
-
-
-
-
-
-
+    const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
+    const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
+    // TTFB > 800: Lose 10 pts
+    // HTML > 1MB: Lose 5 pts
+    // Total perf score (30) -> 15.
+    expect(result.categoryScores.performance).toBe(15);
+    expect(result.score).toBe(85);
+    expect(result.issues).toEqual(expect.arrayContaining([
+      expect.objectContaining({ id: 'slow-ttfb' }),
+      expect.objectContaining({ id: 'large-html' })
+    ]));
   });
 
   it('should penalize infrastructure issues', () => {
-
-
-
-
-
-
-
-
-
-
-
+    const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
+    const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
+    // No IPv6: Lose 10 pts
+    // Single IP: Lose 10 pts
+    // Infra score (20) -> 0.
+    expect(result.categoryScores.infrastructure).toBe(0);
+    expect(result.score).toBe(80);
+    expect(result.issues).toEqual(expect.arrayContaining([
+      expect.objectContaining({ id: 'no-ipv6' }),
+      expect.objectContaining({ id: 'single-ip' })
+    ]));
   });
 });
```