@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,7 +1,7 @@
1
1
  export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
2
2
  export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
3
3
 
4
- export interface SitegraphNode {
4
+ export interface CrawlNode {
5
5
  url: string;
6
6
  depth: number;
7
7
  inLinks: number;
@@ -19,7 +19,7 @@ export interface SitegraphNode {
19
19
  isProductOrCommercial?: boolean;
20
20
  }
21
21
 
22
- export interface SitegraphEdge {
22
+ export interface CrawlEdge {
23
23
  source: string;
24
24
  target: string;
25
25
  }
@@ -32,7 +32,7 @@ export interface OrphanScoringOptions {
32
32
  rootUrl?: string;
33
33
  }
34
34
 
35
- export type AnnotatedNode = SitegraphNode & {
35
+ export type AnnotatedNode = CrawlNode & {
36
36
  orphan: boolean;
37
37
  orphanType?: OrphanType;
38
38
  orphanSeverity?: number;
@@ -46,7 +46,7 @@ const LOW_VALUE_PATTERNS = [
46
46
  /\/search(\/|\?|$)/i
47
47
  ];
48
48
 
49
- function isLowValuePage(node: SitegraphNode): boolean {
49
+ function isLowValuePage(node: CrawlNode): boolean {
50
50
  const type = (node.pageType || '').toLowerCase();
51
51
  if (['pagination', 'tag', 'category', 'filter', 'search', 'archive'].includes(type)) {
52
52
  return true;
@@ -68,7 +68,7 @@ export function mapImpactLevel(score: number): ImpactLevel {
68
68
  return 'critical';
69
69
  }
70
70
 
71
- export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
71
+ export function calculateOrphanSeverity(orphanType: OrphanType, node: CrawlNode): number {
72
72
  let score = 0;
73
73
 
74
74
  switch (orphanType) {
@@ -106,7 +106,7 @@ export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphN
106
106
  return clampScore(score);
107
107
  }
108
108
 
109
- function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
109
+ function consolidateInboundByCanonical(nodes: CrawlNode[]): Map<string, number> {
110
110
  const canonicalInbound = new Map<string, number>();
111
111
  for (const node of nodes) {
112
112
  const canonical = node.canonicalUrl || node.url;
@@ -115,7 +115,7 @@ function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, numb
115
115
  return canonicalInbound;
116
116
  }
117
117
 
118
- export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
118
+ export function annotateOrphans(nodes: CrawlNode[], edges: CrawlEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
119
119
  if (!options.enabled) {
120
120
  return nodes.map((node) => ({ ...node, orphan: false }));
121
121
  }
@@ -144,7 +144,7 @@ export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[],
144
144
  const inboundSources = edges
145
145
  .filter((edge) => edge.target === node.url)
146
146
  .map((edge) => nodeByUrl.get(edge.source))
147
- .filter((source): source is SitegraphNode => Boolean(source));
147
+ .filter((source): source is CrawlNode => Boolean(source));
148
148
 
149
149
  if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
150
150
  orphanType = 'soft';
@@ -76,6 +76,33 @@ describe('structured data', () => {
76
76
  const missing = analyzeStructuredData('<p>none</p>');
77
77
  expect(missing.present).toBe(false);
78
78
  });
79
+
80
+ test('handles array of types', () => {
81
+ const html = '<script type="application/ld+json">{"@type": ["Article", "NewsArticle"]}</script>';
82
+ const result = analyzeStructuredData(html);
83
+ expect(result.types).toContain('Article');
84
+ expect(result.types).toContain('NewsArticle');
85
+ });
86
+
87
+ test('handles @graph structure', () => {
88
+ const html = '<script type="application/ld+json">{"@graph": [{"@type": "Person"}, {"@type": "Organization"}]}</script>';
89
+ const result = analyzeStructuredData(html);
90
+ expect(result.types).toContain('Person');
91
+ expect(result.types).toContain('Organization');
92
+ });
93
+
94
+ test('handles top-level array', () => {
95
+ const html = '<script type="application/ld+json">[{"@type": "A"}, {"@type": "B"}]</script>';
96
+ const result = analyzeStructuredData(html);
97
+ expect(result.types).toContain('A');
98
+ expect(result.types).toContain('B');
99
+ });
100
+
101
+ test('handles empty script content', () => {
102
+ const html = '<script type="application/ld+json"> </script>';
103
+ const result = analyzeStructuredData(html);
104
+ expect(result.valid).toBe(false);
105
+ });
79
106
  });
80
107
 
81
108
  describe('links and images', () => {
@@ -88,6 +115,15 @@ describe('links and images', () => {
88
115
  expect(links.externalRatio).toBeCloseTo(2 / 3);
89
116
  });
90
117
 
118
+ test('link ratio with no links', () => {
119
+ const html = '<div><p>No links here</p></div>';
120
+ const links = analyzeLinks(html, 'https://example.com/page', 'https://example.com');
121
+ expect(links.internalLinks).toBe(0);
122
+ expect(links.externalLinks).toBe(0);
123
+ expect(links.nofollowCount).toBe(0);
124
+ expect(links.externalRatio).toBe(0);
125
+ });
126
+
91
127
  test('image alt detection', () => {
92
128
  const html = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
93
129
  const imgs = analyzeImageAlts(html);
@@ -95,4 +131,12 @@ describe('links and images', () => {
95
131
  expect(imgs.missingAlt).toBe(1);
96
132
  expect(imgs.emptyAlt).toBe(1);
97
133
  });
134
+
135
+ test('image alt detection no images', () => {
136
+ const html = '<div><p>No images here</p></div>';
137
+ const imgs = analyzeImageAlts(html);
138
+ expect(imgs.totalImages).toBe(0);
139
+ expect(imgs.missingAlt).toBe(0);
140
+ expect(imgs.emptyAlt).toBe(0);
141
+ });
98
142
  });
@@ -1,13 +1,75 @@
1
- import { describe, expect, test } from 'vitest';
1
+ import { describe, expect, test, afterEach, vi } from 'vitest';
2
2
  import path from 'node:path';
3
3
  import fs from 'node:fs/promises';
4
4
  import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
5
+ import { getDb, closeDb } from '../src/db/index.js';
6
+ import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
7
+ import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
8
+ import { PageRepository } from '../src/db/repositories/PageRepository.js';
9
+ import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
10
+ import { EngineContext } from '../src/events.js';
11
+
12
+ const mockContext: EngineContext = { emit: vi.fn() };
5
13
 
6
14
  describe('analyze integration', () => {
7
15
  const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');
8
16
 
17
+ async function setupTestDb(rawData: any) {
18
+ // Force in-memory DB for this test
19
+ process.env.CRAWLITH_DB_PATH = ':memory:';
20
+
21
+ // Close existing DB connection if any to ensure fresh start
22
+ closeDb();
23
+
24
+ const db = getDb();
25
+ const siteRepo = new SiteRepository(db);
26
+ const snapshotRepo = new SnapshotRepository(db);
27
+ const pageRepo = new PageRepository(db);
28
+ const edgeRepo = new EdgeRepository(db);
29
+
30
+ // Create site and snapshot
31
+ const domain = 'example.com';
32
+ const siteId = siteRepo.createSite(domain);
33
+ const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
34
+
35
+ // Parse fixture and load pages into db
36
+ const pages = rawData.pages || rawData.nodes || [];
37
+ pages.forEach((p: any) => {
38
+ pageRepo.upsertPage({
39
+ site_id: siteId,
40
+ normalized_url: p.url,
41
+ last_seen_snapshot_id: snapshotId,
42
+ http_status: p.status || 200,
43
+ html: p.html || '',
44
+ depth: p.depth || 0,
45
+ });
46
+ });
47
+
48
+ if (rawData.edges) {
49
+ rawData.edges.forEach((e: any) => {
50
+ const sourceId = pageRepo.getIdByUrl(siteId, e.source);
51
+ const targetId = pageRepo.getIdByUrl(siteId, e.target);
52
+ if (sourceId && targetId) {
53
+ edgeRepo.insertEdge(snapshotId, sourceId, targetId);
54
+ }
55
+ });
56
+ }
57
+
58
+ snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: pages.length, edge_count: (rawData.edges || []).length });
59
+ return { db, siteId, snapshotId };
60
+ }
61
+
62
+ afterEach(() => {
63
+ closeDb();
64
+ delete process.env.CRAWLITH_DB_PATH;
65
+ });
66
+
9
67
  test('analyzes full crawl fixture and schema', async () => {
10
- const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
68
+ const rawContent = await fs.readFile(fixturePath, 'utf-8');
69
+ const rawData = JSON.parse(rawContent);
70
+ await setupTestDb(rawData);
71
+
72
+ const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
11
73
 
12
74
  expect(result.site_summary.pages_analyzed).toBe(3);
13
75
  expect(result.site_summary.duplicate_titles).toBe(2);
@@ -20,79 +82,52 @@ describe('analyze integration', () => {
20
82
  });
21
83
 
22
84
  test('module filter flags behavior', async () => {
23
- const seoOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, seo: true });
85
+ const rawContent = await fs.readFile(fixturePath, 'utf-8');
86
+ const rawData = JSON.parse(rawContent);
87
+ await setupTestDb(rawData);
88
+
89
+ const seoOnly = await analyzeSite('https://example.com', { seo: true }, mockContext);
24
90
  expect(seoOnly.pages[0].content.wordCount).toBe(0);
25
91
  expect(seoOnly.pages[0].images.totalImages).toBe(0);
26
92
 
27
- const contentOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, content: true });
93
+ const contentOnly = await analyzeSite('https://example.com', { content: true }, mockContext);
28
94
  expect(contentOnly.pages[0].title.status).toBe('missing');
29
95
  expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);
30
96
 
31
- const accessibilityOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, accessibility: true });
97
+ const accessibilityOnly = await analyzeSite('https://example.com', { accessibility: true }, mockContext);
32
98
  expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
33
99
  expect(accessibilityOnly.pages[0].title.status).toBe('missing');
34
100
  });
35
101
 
36
102
  test('html report generation', async () => {
37
- const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
103
+ const rawContent = await fs.readFile(fixturePath, 'utf-8');
104
+ const rawData = JSON.parse(rawContent);
105
+ await setupTestDb(rawData);
106
+
107
+ const result = await analyzeSite('https://example.com', {}, mockContext);
38
108
  const html = renderAnalysisHtml(result);
39
109
  expect(html).toContain('<table');
40
110
  expect(html).toContain('Analysis');
41
111
  });
42
112
 
43
113
  test('default database loading', async () => {
44
- // Force in-memory DB for this test
45
- process.env.CRAWLITH_DB_PATH = ':memory:';
46
-
47
- // Close existing DB connection if any to ensure fresh start
48
- const { getDb, closeDb } = await import('../src/db/index.js');
49
- closeDb();
50
-
51
- // Setup repositories
52
- const { SiteRepository } = await import('../src/db/repositories/SiteRepository.js');
53
- const { SnapshotRepository } = await import('../src/db/repositories/SnapshotRepository.js');
54
- const { PageRepository } = await import('../src/db/repositories/PageRepository.js');
55
-
56
- const db = getDb();
57
- const siteRepo = new SiteRepository(db);
58
- const snapshotRepo = new SnapshotRepository(db);
59
- const pageRepo = new PageRepository(db);
60
-
61
- // Create site and snapshot
62
- const siteId = siteRepo.createSite('example.com');
63
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
114
+ // This is essentially same as 'analyzes full crawl fixture' but was explicit before.
115
+ // We can keep it to verify manual DB setup works as expected (which setupTestDb does).
116
+ const rawContent = await fs.readFile(fixturePath, 'utf-8');
117
+ const rawData = JSON.parse(rawContent);
118
+ await setupTestDb(rawData);
64
119
 
65
- // Parse fixture and load pages into db
66
- const rawYaml = await fs.readFile(fixturePath, 'utf-8');
67
- const rawData = JSON.parse(rawYaml);
68
- (rawData.pages || rawData.nodes).forEach((p: any) => {
69
- pageRepo.upsertPage({
70
- site_id: siteId,
71
- normalized_url: p.url,
72
- last_seen_snapshot_id: snapshotId,
73
- http_status: p.status || 200,
74
- html: p.html || '',
75
- depth: p.depth || 0,
76
- });
77
- });
78
-
79
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 3, edge_count: 0 });
80
-
81
- try {
82
- const result = await analyzeSite('https://example.com', {});
83
- expect(result.site_summary.pages_analyzed).toBe(3);
84
- } finally {
85
- closeDb();
86
- delete process.env.CRAWLITH_DB_PATH;
87
- }
120
+ const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
121
+ expect(result.site_summary.pages_analyzed).toBe(3);
88
122
  });
89
123
 
90
124
  test('handles large html and js-only content', async () => {
91
125
  const hugeText = '<html><body><script>document.write("x")</script>' + '<p>word </p>'.repeat(1000) + '</body></html>';
92
- const tmpFile = path.resolve(import.meta.dirname, 'fixtures/large-analyze.json');
93
- await fs.writeFile(tmpFile, JSON.stringify({ pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] }));
94
- const result = await analyzeSite('https://example.com', { fromCrawl: tmpFile });
126
+ const data = { pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] };
127
+
128
+ await setupTestDb(data);
129
+
130
+ const result = await analyzeSite('https://example.com', {}, mockContext);
95
131
  expect(result.pages[0].content.wordCount).toBe(1000);
96
- await fs.unlink(tmpFile);
97
132
  });
98
133
  });
@@ -0,0 +1,98 @@
1
+ import { describe, expect, test } from 'vitest';
2
+ import { renderAnalysisMarkdown, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
3
+
4
+ describe('renderAnalysisMarkdown', () => {
5
+ const mockPage: PageAnalysis = {
6
+ url: 'https://example.com/page1',
7
+ status: 200,
8
+ title: { value: 'Page 1', length: 6, status: 'ok' },
9
+ metaDescription: { value: 'Desc 1', length: 6, status: 'ok' },
10
+ h1: { count: 1, status: 'ok', matchesTitle: true },
11
+ content: { wordCount: 100, textHtmlRatio: 0.5, uniqueSentenceCount: 10 },
12
+ thinScore: 0,
13
+ images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
+ links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
15
+ structuredData: { present: true, valid: true, types: ['Article'] },
16
+ seoScore: 90,
17
+ meta: {}
18
+ };
19
+
20
+ const mockResult: AnalysisResult = {
21
+ site_summary: {
22
+ pages_analyzed: 2,
23
+ avg_seo_score: 85,
24
+ thin_pages: 0,
25
+ duplicate_titles: 0,
26
+ site_score: 88,
27
+ },
28
+ site_scores: {
29
+ overallScore: 88,
30
+ seoHealthScore: 85,
31
+ } as any, // casting to any to avoid mocking full return type of aggregateSiteScore if complex
32
+ pages: [
33
+ mockPage,
34
+ {
35
+ ...mockPage,
36
+ url: 'https://example.com/page2',
37
+ seoScore: 80,
38
+ thinScore: 10,
39
+ title: { value: 'Page 2', length: 6, status: 'duplicate' },
40
+ metaDescription: { value: 'Desc 2', length: 6, status: 'missing' },
41
+ }
42
+ ],
43
+ active_modules: {
44
+ seo: true,
45
+ content: true,
46
+ accessibility: true,
47
+ },
48
+ };
49
+
50
+ test('renders markdown summary correctly', () => {
51
+ const markdown = renderAnalysisMarkdown(mockResult);
52
+
53
+ expect(markdown).toContain('# Crawlith SEO Analysis Report');
54
+ expect(markdown).toContain('## 📊 Summary');
55
+ expect(markdown).toContain('- Pages Analyzed: 2');
56
+ expect(markdown).toContain('- Overall Site Score: 88.0');
57
+ expect(markdown).toContain('- Avg SEO Score: 85.0');
58
+ expect(markdown).toContain('- Thin Pages Found: 0');
59
+ expect(markdown).toContain('- Duplicate Titles: 0');
60
+ });
61
+
62
+ test('renders page details table header', () => {
63
+ const markdown = renderAnalysisMarkdown(mockResult);
64
+
65
+ expect(markdown).toContain('## 📄 Page Details');
66
+ expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
67
+ expect(markdown).toContain('| :--- | :--- | :--- | :--- | :--- |');
68
+ });
69
+
70
+ test('renders page rows correctly', () => {
71
+ const markdown = renderAnalysisMarkdown(mockResult);
72
+
73
+ // Check first page row
74
+ expect(markdown).toContain('| https://example.com/page1 | 90 | 0 | ok | ok |');
75
+
76
+ // Check second page row
77
+ expect(markdown).toContain('| https://example.com/page2 | 80 | 10 | duplicate | missing |');
78
+ });
79
+
80
+ test('handles empty pages list', () => {
81
+ const emptyResult: AnalysisResult = {
82
+ ...mockResult,
83
+ pages: [],
84
+ site_summary: {
85
+ ...mockResult.site_summary,
86
+ pages_analyzed: 0,
87
+ }
88
+ };
89
+
90
+ const markdown = renderAnalysisMarkdown(emptyResult);
91
+
92
+ expect(markdown).toContain('- Pages Analyzed: 0');
93
+ // Should still contain headers
94
+ expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
95
+ // Should not contain any data rows
96
+ expect(markdown).not.toContain('| https://example.com');
97
+ });
98
+ });
@@ -0,0 +1,101 @@
1
+ import { describe, it, expect, vi, beforeEach } from 'vitest';
2
+ import { auditUrl } from '../../src/audit/index.js';
3
+ import { resolveDns } from '../../src/audit/dns.js';
4
+ import { analyzeTransport } from '../../src/audit/transport.js';
5
+ import { analyzeHeaders } from '../../src/audit/headers.js';
6
+ import { calculateScore } from '../../src/audit/scoring.js';
7
+ import { IPGuard } from '../../src/core/security/ipGuard.js';
8
+
9
+ // Mock dependencies
10
+ vi.mock('../../src/audit/dns.js', () => ({
11
+ resolveDns: vi.fn(),
12
+ }));
13
+ vi.mock('../../src/audit/transport.js', () => ({
14
+ analyzeTransport: vi.fn(),
15
+ }));
16
+ vi.mock('../../src/audit/headers.js', () => ({
17
+ analyzeHeaders: vi.fn(),
18
+ }));
19
+ vi.mock('../../src/audit/scoring.js', () => ({
20
+ calculateScore: vi.fn(),
21
+ }));
22
+ vi.mock('../../src/core/security/ipGuard.js', () => ({
23
+ IPGuard: {
24
+ validateHost: vi.fn(),
25
+ },
26
+ }));
27
+
28
+ describe('auditUrl', () => {
29
+ const mockUrl = 'https://example.com';
30
+
31
+ beforeEach(() => {
32
+ vi.resetAllMocks();
33
+ });
34
+
35
+ it('should successfully audit a valid URL', async () => {
36
+ // Setup mocks
37
+ vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
38
+
39
+ const mockDnsResult = { ip: '1.2.3.4' };
40
+ vi.mocked(resolveDns).mockResolvedValue(mockDnsResult as any);
41
+
42
+ const mockTransportResult = {
43
+ transport: { headers: {} },
44
+ performance: { loadTime: 100 },
45
+ issues: [],
46
+ };
47
+ vi.mocked(analyzeTransport).mockResolvedValue(mockTransportResult as any);
48
+
49
+ const mockHeadersResult = { grade: 'A' };
50
+ vi.mocked(analyzeHeaders).mockReturnValue(mockHeadersResult as any);
51
+
52
+ const mockScoringResult = {
53
+ score: 95,
54
+ grade: 'A',
55
+ issues: [],
56
+ };
57
+ vi.mocked(calculateScore).mockReturnValue(mockScoringResult as any);
58
+
59
+ // Execute
60
+ const result = await auditUrl(mockUrl);
61
+
62
+ // Verify
63
+ expect(IPGuard.validateHost).toHaveBeenCalledWith('example.com');
64
+ expect(resolveDns).toHaveBeenCalledWith('example.com');
65
+ expect(analyzeTransport).toHaveBeenCalledWith(mockUrl, 10000); // default timeout
66
+ expect(analyzeHeaders).toHaveBeenCalledWith(mockTransportResult.transport.headers);
67
+ expect(calculateScore).toHaveBeenCalled();
68
+
69
+ expect(result).toEqual({
70
+ url: mockUrl,
71
+ transport: mockTransportResult.transport,
72
+ securityHeaders: mockHeadersResult,
73
+ dns: mockDnsResult,
74
+ performance: mockTransportResult.performance,
75
+ score: mockScoringResult.score,
76
+ grade: mockScoringResult.grade,
77
+ issues: mockScoringResult.issues,
78
+ });
79
+ });
80
+
81
+ it('should throw error for invalid URL protocol', async () => {
82
+ await expect(auditUrl('ftp://example.com')).rejects.toThrow('Only HTTP and HTTPS protocols are supported');
83
+ });
84
+
85
+ it('should throw error for malformed URL', async () => {
86
+ await expect(auditUrl('not-a-url')).rejects.toThrow('Invalid URL');
87
+ });
88
+
89
+ it('should throw error if SSRF check fails', async () => {
90
+ vi.mocked(IPGuard.validateHost).mockResolvedValue(false);
91
+ await expect(auditUrl(mockUrl)).rejects.toThrow('Access to internal or private infrastructure is prohibited');
92
+ });
93
+
94
+ it('should propagate errors from dependencies', async () => {
95
+ vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
96
+ vi.mocked(resolveDns).mockRejectedValue(new Error('DNS Error'));
97
+ vi.mocked(analyzeTransport).mockResolvedValue({} as any); // Should resolve if DNS fails? Wait, Promise.all fails if any fails.
98
+
99
+ await expect(auditUrl(mockUrl)).rejects.toThrow('DNS Error');
100
+ });
101
+ });
@@ -1,6 +1,6 @@
1
1
  import { describe, it, expect } from 'vitest';
2
2
  import { calculateScore } from '../../src/audit/scoring.js';
3
- import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics, AuditIssue } from '../../src/audit/types.js';
3
+ import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics } from '../../src/audit/types.js';
4
4
 
5
5
  describe('Scoring Engine', () => {
6
6
  const mockTransport: TransportDiagnostics = {
@@ -84,8 +84,8 @@ describe('Scoring Engine', () => {
84
84
 
85
85
  it('should fail on expired cert', () => {
86
86
  const expiredTransport = {
87
- ...mockTransport,
88
- certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' }
87
+ ...mockTransport,
88
+ certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' }
89
89
  };
90
90
  const result = calculateScore(expiredTransport, mockDns, mockHeaders, mockPerformance, []);
91
91
  expect(result.grade).toBe('F');
@@ -104,30 +104,30 @@ describe('Scoring Engine', () => {
104
104
  });
105
105
 
106
106
  it('should penalize poor performance', () => {
107
- const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
108
- const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
109
- // TTFB > 800: Lose 10 pts
110
- // HTML > 1MB: Lose 5 pts
111
- // Total perf score (30) -> 15.
112
- expect(result.categoryScores.performance).toBe(15);
113
- expect(result.score).toBe(85);
114
- expect(result.issues).toEqual(expect.arrayContaining([
115
- expect.objectContaining({ id: 'slow-ttfb' }),
116
- expect.objectContaining({ id: 'large-html' })
117
- ]));
107
+ const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
108
+ const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
109
+ // TTFB > 800: Lose 10 pts
110
+ // HTML > 1MB: Lose 5 pts
111
+ // Total perf score (30) -> 15.
112
+ expect(result.categoryScores.performance).toBe(15);
113
+ expect(result.score).toBe(85);
114
+ expect(result.issues).toEqual(expect.arrayContaining([
115
+ expect.objectContaining({ id: 'slow-ttfb' }),
116
+ expect.objectContaining({ id: 'large-html' })
117
+ ]));
118
118
  });
119
119
 
120
120
  it('should penalize infrastructure issues', () => {
121
- const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
122
- const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
123
- // No IPv6: Lose 10 pts
124
- // Single IP: Lose 10 pts
125
- // Infra score (20) -> 0.
126
- expect(result.categoryScores.infrastructure).toBe(0);
127
- expect(result.score).toBe(80);
128
- expect(result.issues).toEqual(expect.arrayContaining([
129
- expect.objectContaining({ id: 'no-ipv6' }),
130
- expect.objectContaining({ id: 'single-ip' })
131
- ]));
121
+ const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
122
+ const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
123
+ // No IPv6: Lose 10 pts
124
+ // Single IP: Lose 10 pts
125
+ // Infra score (20) -> 0.
126
+ expect(result.categoryScores.infrastructure).toBe(0);
127
+ expect(result.score).toBe(80);
128
+ expect(result.issues).toEqual(expect.arrayContaining([
129
+ expect.objectContaining({ id: 'no-ipv6' }),
130
+ expect.objectContaining({ id: 'single-ip' })
131
+ ]));
132
132
  });
133
133
  });
@@ -1,7 +1,6 @@
1
1
  import { describe, it, expect, vi, afterEach } from 'vitest';
2
2
  import { analyzeTransport } from '../../src/audit/transport.js';
3
3
  import https from 'node:https';
4
- import http from 'node:http';
5
4
  import tls from 'node:tls';
6
5
  import { EventEmitter } from 'events';
7
6