@crawlith/core 0.1.1 → 0.1.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,49 +0,0 @@
1
- // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
-
3
- exports[`orphan detection and severity scoring > canonical consolidation, robots exclusion, and deterministic JSON output snapshot 1`] = `
4
- "[
5
- {
6
- "url": "https://example.com/canonical",
7
- "depth": 1,
8
- "inLinks": 0,
9
- "outLinks": 0,
10
- "status": 200,
11
- "orphan": true,
12
- "orphanType": "near",
13
- "orphanSeverity": 80,
14
- "impactLevel": "high"
15
- },
16
- {
17
- "url": "https://example.com/variant?a=1",
18
- "depth": 1,
19
- "inLinks": 1,
20
- "outLinks": 0,
21
- "status": 200,
22
- "canonicalUrl": "https://example.com/canonical",
23
- "orphan": true,
24
- "orphanType": "near",
25
- "orphanSeverity": 80,
26
- "impactLevel": "high"
27
- },
28
- {
29
- "url": "https://example.com/blocked",
30
- "depth": 1,
31
- "inLinks": 0,
32
- "outLinks": 0,
33
- "status": 200,
34
- "robotsExcluded": true,
35
- "orphan": false
36
- },
37
- {
38
- "url": "https://example.com/redirect-target",
39
- "depth": 1,
40
- "inLinks": 1,
41
- "outLinks": 0,
42
- "status": 200,
43
- "orphan": true,
44
- "orphanType": "near",
45
- "orphanSeverity": 80,
46
- "impactLevel": "high"
47
- }
48
- ]"
49
- `;
@@ -1,142 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { analyzeTitle, analyzeMetaDescription, applyDuplicateStatuses, analyzeH1 } from '../src/analysis/seo.js';
3
- import { analyzeContent, calculateThinContentScore } from '../src/analysis/content.js';
4
- import { analyzeStructuredData } from '../src/analysis/structuredData.js';
5
- import { analyzeLinks } from '../src/analysis/links.js';
6
- import { analyzeImageAlts } from '../src/analysis/images.js';
7
-
8
- describe('SEO module', () => {
9
- test('analyze title edge cases', () => {
10
- expect(analyzeTitle('<html></html>').status).toBe('missing');
11
- expect(analyzeTitle('<title>short</title>').status).toBe('too_short');
12
- expect(analyzeTitle(`<title>${'a'.repeat(61)}</title>`).status).toBe('too_long');
13
- expect(analyzeTitle(`<title>${'a'.repeat(55)}</title>`).status).toBe('ok');
14
- });
15
-
16
- test('duplicate detection', () => {
17
- const values = applyDuplicateStatuses([
18
- { value: 'Same', length: 4, status: 'ok' as const },
19
- { value: 'same', length: 4, status: 'ok' as const },
20
- { value: null, length: 0, status: 'missing' as const }
21
- ]);
22
- expect(values[0].status).toBe('duplicate');
23
- expect(values[1].status).toBe('duplicate');
24
- expect(values[2].status).toBe('missing');
25
- });
26
-
27
- test('meta description boundaries', () => {
28
- expect(analyzeMetaDescription('<meta name="description" content="">').status).toBe('missing');
29
- expect(analyzeMetaDescription('<html></html>').status).toBe('missing');
30
- expect(analyzeMetaDescription('<meta name="description" content="short">').status).toBe('too_short');
31
- expect(analyzeMetaDescription(`<meta name="description" content="${'x'.repeat(150)}">`).status).toBe('ok');
32
- expect(analyzeMetaDescription(`<meta name="description" content="${'x'.repeat(170)}">`).status).toBe('too_long');
33
- });
34
-
35
- test('h1 variations', () => {
36
- expect(analyzeH1('<h1>One</h1>', 'Title').status).toBe('ok');
37
- expect(analyzeH1('<h1>One</h1><h1>Two</h1>', 'Title').status).toBe('warning');
38
- const noH1 = analyzeH1('<p>none</p>', 'Title');
39
- expect(noH1.status).toBe('critical');
40
- const same = analyzeH1('<h1>same</h1>', 'Same');
41
- expect(same.matchesTitle).toBe(true);
42
- });
43
- });
44
-
45
- describe('content module', () => {
46
- test('word count strips nav/footer/script/style', () => {
47
- const html = '<body><nav>skip me</nav><p>keep words here</p><footer>skip</footer><script>var x</script><style>.x{}</style></body>';
48
- const result = analyzeContent(html);
49
- expect(result.wordCount).toBe(3);
50
- expect(result.uniqueSentenceCount).toBe(1);
51
- expect(result.textHtmlRatio).toBeGreaterThan(0);
52
- });
53
-
54
- test('thin score boundaries', () => {
55
- expect(calculateThinContentScore({ wordCount: 600, textHtmlRatio: 0.5, uniqueSentenceCount: 4 }, 0)).toBe(0);
56
- expect(calculateThinContentScore({ wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 1 }, 100)).toBe(100);
57
- });
58
-
59
- test('content handles malformed/empty html', () => {
60
- expect(analyzeContent('').wordCount).toBe(0);
61
- expect(analyzeContent('<div><span>broken').wordCount).toBeGreaterThanOrEqual(1);
62
- });
63
- });
64
-
65
- describe('structured data', () => {
66
- test('valid and invalid JSON-LD parsing', () => {
67
- const valid = analyzeStructuredData('<script type="application/ld+json">{"@type":"Article"}</script>');
68
- expect(valid.present).toBe(true);
69
- expect(valid.valid).toBe(true);
70
- expect(valid.types).toContain('Article');
71
-
72
- const invalid = analyzeStructuredData('<script type="application/ld+json">{invalid}</script>');
73
- expect(invalid.present).toBe(true);
74
- expect(invalid.valid).toBe(false);
75
-
76
- const missing = analyzeStructuredData('<p>none</p>');
77
- expect(missing.present).toBe(false);
78
- });
79
-
80
- test('handles array of types', () => {
81
- const html = '<script type="application/ld+json">{"@type": ["Article", "NewsArticle"]}</script>';
82
- const result = analyzeStructuredData(html);
83
- expect(result.types).toContain('Article');
84
- expect(result.types).toContain('NewsArticle');
85
- });
86
-
87
- test('handles @graph structure', () => {
88
- const html = '<script type="application/ld+json">{"@graph": [{"@type": "Person"}, {"@type": "Organization"}]}</script>';
89
- const result = analyzeStructuredData(html);
90
- expect(result.types).toContain('Person');
91
- expect(result.types).toContain('Organization');
92
- });
93
-
94
- test('handles top-level array', () => {
95
- const html = '<script type="application/ld+json">[{"@type": "A"}, {"@type": "B"}]</script>';
96
- const result = analyzeStructuredData(html);
97
- expect(result.types).toContain('A');
98
- expect(result.types).toContain('B');
99
- });
100
-
101
- test('handles empty script content', () => {
102
- const html = '<script type="application/ld+json"> </script>';
103
- const result = analyzeStructuredData(html);
104
- expect(result.valid).toBe(false);
105
- });
106
- });
107
-
108
- describe('links and images', () => {
109
- test('link ratio calculation', () => {
110
- const html = '<a href="/a">A</a><a href="https://other.com">B</a><a href="https://other.com" rel="nofollow">C</a>';
111
- const links = analyzeLinks(html, 'https://example.com/page', 'https://example.com');
112
- expect(links.internalLinks).toBe(1);
113
- expect(links.externalLinks).toBe(2);
114
- expect(links.nofollowCount).toBe(1);
115
- expect(links.externalRatio).toBeCloseTo(2 / 3);
116
- });
117
-
118
- test('link ratio with no links', () => {
119
- const html = '<div><p>No links here</p></div>';
120
- const links = analyzeLinks(html, 'https://example.com/page', 'https://example.com');
121
- expect(links.internalLinks).toBe(0);
122
- expect(links.externalLinks).toBe(0);
123
- expect(links.nofollowCount).toBe(0);
124
- expect(links.externalRatio).toBe(0);
125
- });
126
-
127
- test('image alt detection', () => {
128
- const html = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
129
- const imgs = analyzeImageAlts(html);
130
- expect(imgs.totalImages).toBe(3);
131
- expect(imgs.missingAlt).toBe(1);
132
- expect(imgs.emptyAlt).toBe(1);
133
- });
134
-
135
- test('image alt detection no images', () => {
136
- const html = '<div><p>No images here</p></div>';
137
- const imgs = analyzeImageAlts(html);
138
- expect(imgs.totalImages).toBe(0);
139
- expect(imgs.missingAlt).toBe(0);
140
- expect(imgs.emptyAlt).toBe(0);
141
- });
142
- });
@@ -1,133 +0,0 @@
1
- import { describe, expect, test, afterEach, vi } from 'vitest';
2
- import path from 'node:path';
3
- import fs from 'node:fs/promises';
4
- import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
5
- import { getDb, closeDb } from '../src/db/index.js';
6
- import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
7
- import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
8
- import { PageRepository } from '../src/db/repositories/PageRepository.js';
9
- import { EdgeRepository } from '../src/db/repositories/EdgeRepository.js';
10
- import { EngineContext } from '../src/events.js';
11
-
12
- const mockContext: EngineContext = { emit: vi.fn() };
13
-
14
- describe('analyze integration', () => {
15
- const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');
16
-
17
- async function setupTestDb(rawData: any) {
18
- // Force in-memory DB for this test
19
- process.env.CRAWLITH_DB_PATH = ':memory:';
20
-
21
- // Close existing DB connection if any to ensure fresh start
22
- closeDb();
23
-
24
- const db = getDb();
25
- const siteRepo = new SiteRepository(db);
26
- const snapshotRepo = new SnapshotRepository(db);
27
- const pageRepo = new PageRepository(db);
28
- const edgeRepo = new EdgeRepository(db);
29
-
30
- // Create site and snapshot
31
- const domain = 'example.com';
32
- const siteId = siteRepo.createSite(domain);
33
- const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
34
-
35
- // Parse fixture and load pages into db
36
- const pages = rawData.pages || rawData.nodes || [];
37
- pages.forEach((p: any) => {
38
- pageRepo.upsertPage({
39
- site_id: siteId,
40
- normalized_url: p.url,
41
- last_seen_snapshot_id: snapshotId,
42
- http_status: p.status || 200,
43
- html: p.html || '',
44
- depth: p.depth || 0,
45
- });
46
- });
47
-
48
- if (rawData.edges) {
49
- rawData.edges.forEach((e: any) => {
50
- const sourceId = pageRepo.getIdByUrl(siteId, e.source);
51
- const targetId = pageRepo.getIdByUrl(siteId, e.target);
52
- if (sourceId && targetId) {
53
- edgeRepo.insertEdge(snapshotId, sourceId, targetId);
54
- }
55
- });
56
- }
57
-
58
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: pages.length, edge_count: (rawData.edges || []).length });
59
- return { db, siteId, snapshotId };
60
- }
61
-
62
- afterEach(() => {
63
- closeDb();
64
- delete process.env.CRAWLITH_DB_PATH;
65
- });
66
-
67
- test('analyzes full crawl fixture and schema', async () => {
68
- const rawContent = await fs.readFile(fixturePath, 'utf-8');
69
- const rawData = JSON.parse(rawContent);
70
- await setupTestDb(rawData);
71
-
72
- const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
73
-
74
- expect(result.site_summary.pages_analyzed).toBe(3);
75
- expect(result.site_summary.duplicate_titles).toBe(2);
76
- expect(result.site_summary.avg_seo_score).toBeGreaterThanOrEqual(0);
77
- expect(result.pages[0]).toHaveProperty('title');
78
- expect(result.pages[0]).toHaveProperty('content');
79
- expect(result.pages[0]).toHaveProperty('links');
80
- expect(result.site_scores.overallScore).toBeGreaterThanOrEqual(0);
81
- expect(result.site_scores.overallScore).toBeLessThanOrEqual(100);
82
- });
83
-
84
- test('module filter flags behavior', async () => {
85
- const rawContent = await fs.readFile(fixturePath, 'utf-8');
86
- const rawData = JSON.parse(rawContent);
87
- await setupTestDb(rawData);
88
-
89
- const seoOnly = await analyzeSite('https://example.com', { seo: true }, mockContext);
90
- expect(seoOnly.pages[0].content.wordCount).toBe(0);
91
- expect(seoOnly.pages[0].images.totalImages).toBe(0);
92
-
93
- const contentOnly = await analyzeSite('https://example.com', { content: true }, mockContext);
94
- expect(contentOnly.pages[0].title.status).toBe('missing');
95
- expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);
96
-
97
- const accessibilityOnly = await analyzeSite('https://example.com', { accessibility: true }, mockContext);
98
- expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
99
- expect(accessibilityOnly.pages[0].title.status).toBe('missing');
100
- });
101
-
102
- test('html report generation', async () => {
103
- const rawContent = await fs.readFile(fixturePath, 'utf-8');
104
- const rawData = JSON.parse(rawContent);
105
- await setupTestDb(rawData);
106
-
107
- const result = await analyzeSite('https://example.com', {}, mockContext);
108
- const html = renderAnalysisHtml(result);
109
- expect(html).toContain('<table');
110
- expect(html).toContain('Analysis');
111
- });
112
-
113
- test('default database loading', async () => {
114
- // This is essentially same as 'analyzes full crawl fixture' but was explicit before.
115
- // We can keep it to verify manual DB setup works as expected (which setupTestDb does).
116
- const rawContent = await fs.readFile(fixturePath, 'utf-8');
117
- const rawData = JSON.parse(rawContent);
118
- await setupTestDb(rawData);
119
-
120
- const result = await analyzeSite('https://example.com', { allPages: true }, mockContext);
121
- expect(result.site_summary.pages_analyzed).toBe(3);
122
- });
123
-
124
- test('handles large html and js-only content', async () => {
125
- const hugeText = '<html><body><script>document.write("x")</script>' + '<p>word </p>'.repeat(1000) + '</body></html>';
126
- const data = { pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] };
127
-
128
- await setupTestDb(data);
129
-
130
- const result = await analyzeSite('https://example.com', {}, mockContext);
131
- expect(result.pages[0].content.wordCount).toBe(1000);
132
- });
133
- });
@@ -1,98 +0,0 @@
1
- import { describe, expect, test } from 'vitest';
2
- import { renderAnalysisMarkdown, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
3
-
4
- describe('renderAnalysisMarkdown', () => {
5
- const mockPage: PageAnalysis = {
6
- url: 'https://example.com/page1',
7
- status: 200,
8
- title: { value: 'Page 1', length: 6, status: 'ok' },
9
- metaDescription: { value: 'Desc 1', length: 6, status: 'ok' },
10
- h1: { count: 1, status: 'ok', matchesTitle: true },
11
- content: { wordCount: 100, textHtmlRatio: 0.5, uniqueSentenceCount: 10 },
12
- thinScore: 0,
13
- images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
- links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
15
- structuredData: { present: true, valid: true, types: ['Article'] },
16
- seoScore: 90,
17
- meta: {}
18
- };
19
-
20
- const mockResult: AnalysisResult = {
21
- site_summary: {
22
- pages_analyzed: 2,
23
- avg_seo_score: 85,
24
- thin_pages: 0,
25
- duplicate_titles: 0,
26
- site_score: 88,
27
- },
28
- site_scores: {
29
- overallScore: 88,
30
- seoHealthScore: 85,
31
- } as any, // casting to any to avoid mocking full return type of aggregateSiteScore if complex
32
- pages: [
33
- mockPage,
34
- {
35
- ...mockPage,
36
- url: 'https://example.com/page2',
37
- seoScore: 80,
38
- thinScore: 10,
39
- title: { value: 'Page 2', length: 6, status: 'duplicate' },
40
- metaDescription: { value: 'Desc 2', length: 6, status: 'missing' },
41
- }
42
- ],
43
- active_modules: {
44
- seo: true,
45
- content: true,
46
- accessibility: true,
47
- },
48
- };
49
-
50
- test('renders markdown summary correctly', () => {
51
- const markdown = renderAnalysisMarkdown(mockResult);
52
-
53
- expect(markdown).toContain('# Crawlith SEO Analysis Report');
54
- expect(markdown).toContain('## 📊 Summary');
55
- expect(markdown).toContain('- Pages Analyzed: 2');
56
- expect(markdown).toContain('- Overall Site Score: 88.0');
57
- expect(markdown).toContain('- Avg SEO Score: 85.0');
58
- expect(markdown).toContain('- Thin Pages Found: 0');
59
- expect(markdown).toContain('- Duplicate Titles: 0');
60
- });
61
-
62
- test('renders page details table header', () => {
63
- const markdown = renderAnalysisMarkdown(mockResult);
64
-
65
- expect(markdown).toContain('## 📄 Page Details');
66
- expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
67
- expect(markdown).toContain('| :--- | :--- | :--- | :--- | :--- |');
68
- });
69
-
70
- test('renders page rows correctly', () => {
71
- const markdown = renderAnalysisMarkdown(mockResult);
72
-
73
- // Check first page row
74
- expect(markdown).toContain('| https://example.com/page1 | 90 | 0 | ok | ok |');
75
-
76
- // Check second page row
77
- expect(markdown).toContain('| https://example.com/page2 | 80 | 10 | duplicate | missing |');
78
- });
79
-
80
- test('handles empty pages list', () => {
81
- const emptyResult: AnalysisResult = {
82
- ...mockResult,
83
- pages: [],
84
- site_summary: {
85
- ...mockResult.site_summary,
86
- pages_analyzed: 0,
87
- }
88
- };
89
-
90
- const markdown = renderAnalysisMarkdown(emptyResult);
91
-
92
- expect(markdown).toContain('- Pages Analyzed: 0');
93
- // Should still contain headers
94
- expect(markdown).toContain('| URL | SEO Score | Thin Score | Title Status | Meta Status |');
95
- // Should not contain any data rows
96
- expect(markdown).not.toContain('| https://example.com');
97
- });
98
- });
@@ -1,101 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { auditUrl } from '../../src/audit/index.js';
3
- import { resolveDns } from '../../src/audit/dns.js';
4
- import { analyzeTransport } from '../../src/audit/transport.js';
5
- import { analyzeHeaders } from '../../src/audit/headers.js';
6
- import { calculateScore } from '../../src/audit/scoring.js';
7
- import { IPGuard } from '../../src/core/security/ipGuard.js';
8
-
9
- // Mock dependencies
10
- vi.mock('../../src/audit/dns.js', () => ({
11
- resolveDns: vi.fn(),
12
- }));
13
- vi.mock('../../src/audit/transport.js', () => ({
14
- analyzeTransport: vi.fn(),
15
- }));
16
- vi.mock('../../src/audit/headers.js', () => ({
17
- analyzeHeaders: vi.fn(),
18
- }));
19
- vi.mock('../../src/audit/scoring.js', () => ({
20
- calculateScore: vi.fn(),
21
- }));
22
- vi.mock('../../src/core/security/ipGuard.js', () => ({
23
- IPGuard: {
24
- validateHost: vi.fn(),
25
- },
26
- }));
27
-
28
- describe('auditUrl', () => {
29
- const mockUrl = 'https://example.com';
30
-
31
- beforeEach(() => {
32
- vi.resetAllMocks();
33
- });
34
-
35
- it('should successfully audit a valid URL', async () => {
36
- // Setup mocks
37
- vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
38
-
39
- const mockDnsResult = { ip: '1.2.3.4' };
40
- vi.mocked(resolveDns).mockResolvedValue(mockDnsResult as any);
41
-
42
- const mockTransportResult = {
43
- transport: { headers: {} },
44
- performance: { loadTime: 100 },
45
- issues: [],
46
- };
47
- vi.mocked(analyzeTransport).mockResolvedValue(mockTransportResult as any);
48
-
49
- const mockHeadersResult = { grade: 'A' };
50
- vi.mocked(analyzeHeaders).mockReturnValue(mockHeadersResult as any);
51
-
52
- const mockScoringResult = {
53
- score: 95,
54
- grade: 'A',
55
- issues: [],
56
- };
57
- vi.mocked(calculateScore).mockReturnValue(mockScoringResult as any);
58
-
59
- // Execute
60
- const result = await auditUrl(mockUrl);
61
-
62
- // Verify
63
- expect(IPGuard.validateHost).toHaveBeenCalledWith('example.com');
64
- expect(resolveDns).toHaveBeenCalledWith('example.com');
65
- expect(analyzeTransport).toHaveBeenCalledWith(mockUrl, 10000); // default timeout
66
- expect(analyzeHeaders).toHaveBeenCalledWith(mockTransportResult.transport.headers);
67
- expect(calculateScore).toHaveBeenCalled();
68
-
69
- expect(result).toEqual({
70
- url: mockUrl,
71
- transport: mockTransportResult.transport,
72
- securityHeaders: mockHeadersResult,
73
- dns: mockDnsResult,
74
- performance: mockTransportResult.performance,
75
- score: mockScoringResult.score,
76
- grade: mockScoringResult.grade,
77
- issues: mockScoringResult.issues,
78
- });
79
- });
80
-
81
- it('should throw error for invalid URL protocol', async () => {
82
- await expect(auditUrl('ftp://example.com')).rejects.toThrow('Only HTTP and HTTPS protocols are supported');
83
- });
84
-
85
- it('should throw error for malformed URL', async () => {
86
- await expect(auditUrl('not-a-url')).rejects.toThrow('Invalid URL');
87
- });
88
-
89
- it('should throw error if SSRF check fails', async () => {
90
- vi.mocked(IPGuard.validateHost).mockResolvedValue(false);
91
- await expect(auditUrl(mockUrl)).rejects.toThrow('Access to internal or private infrastructure is prohibited');
92
- });
93
-
94
- it('should propagate errors from dependencies', async () => {
95
- vi.mocked(IPGuard.validateHost).mockResolvedValue(true);
96
- vi.mocked(resolveDns).mockRejectedValue(new Error('DNS Error'));
97
- vi.mocked(analyzeTransport).mockResolvedValue({} as any); // Should resolve if DNS fails? Wait, Promise.all fails if any fails.
98
-
99
- await expect(auditUrl(mockUrl)).rejects.toThrow('DNS Error');
100
- });
101
- });
@@ -1,31 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { resolveDns } from '../../src/audit/dns.js';
3
- import dns from 'node:dns/promises';
4
-
5
- vi.mock('node:dns/promises');
6
-
7
- describe('DNS Diagnostics', () => {
8
- it('should resolve all records', async () => {
9
- vi.spyOn(dns, 'resolve4').mockResolvedValue(['1.1.1.1']);
10
- vi.spyOn(dns, 'resolve6').mockResolvedValue(['2606::1']);
11
- vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENODATA'));
12
- vi.spyOn(dns, 'reverse').mockResolvedValue(['one.one.one.one']);
13
-
14
- const result = await resolveDns('example.com');
15
- expect(result.a).toEqual(['1.1.1.1']);
16
- expect(result.aaaa).toEqual(['2606::1']);
17
- expect(result.ipv6Support).toBe(true);
18
- expect(result.reverse).toEqual(['one.one.one.one']);
19
- expect(result.resolutionTime).toBeGreaterThanOrEqual(0);
20
- });
21
-
22
- it('should handle failures gracefully', async () => {
23
- vi.spyOn(dns, 'resolve4').mockRejectedValue(new Error('ENOTFOUND'));
24
- vi.spyOn(dns, 'resolve6').mockRejectedValue(new Error('ENOTFOUND'));
25
- vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENOTFOUND'));
26
-
27
- const result = await resolveDns('invalid.com');
28
- expect(result.a).toEqual([]);
29
- expect(result.ipCount).toBe(0);
30
- });
31
- });
@@ -1,45 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { analyzeHeaders } from '../../src/audit/headers.js';
3
-
4
- describe('Headers Analysis', () => {
5
- it('should detect all secure headers', () => {
6
- const headers = {
7
- 'strict-transport-security': 'max-age=31536000; includeSubDomains',
8
- 'content-security-policy': "default-src 'self'",
9
- 'x-frame-options': 'DENY',
10
- 'x-content-type-options': 'nosniff',
11
- 'referrer-policy': 'strict-origin-when-cross-origin',
12
- 'permissions-policy': 'geolocation=()'
13
- };
14
- const result = analyzeHeaders(headers);
15
- expect(result.score).toBe(100);
16
- expect(result.strictTransportSecurity.valid).toBe(true);
17
- });
18
-
19
- it('should handle missing headers', () => {
20
- const headers = {};
21
- const result = analyzeHeaders(headers);
22
- expect(result.score).toBe(0);
23
- expect(result.strictTransportSecurity.present).toBe(false);
24
- });
25
-
26
- it('should validate HSTS properly', () => {
27
- const headers = {
28
- 'strict-transport-security': 'max-age=0'
29
- };
30
- // valid requires max-age
31
- const result = analyzeHeaders(headers);
32
- expect(result.strictTransportSecurity.valid).toBe(true);
33
- // Wait, checkHSTS: includes('max-age=') is true. includes('includeSubDomains') is false.
34
- // Issues will contain 'Missing includeSubDomains'.
35
- expect(result.strictTransportSecurity.issues).toContain('Missing includeSubDomains');
36
- });
37
-
38
- it('should validate invalid HSTS', () => {
39
- const headers = {
40
- 'strict-transport-security': 'invalid'
41
- };
42
- const result = analyzeHeaders(headers);
43
- expect(result.strictTransportSecurity.valid).toBe(false);
44
- });
45
- });