@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,84 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { DomainFilter } from '../src/core/scope/domainFilter.js';
3
- import { SubdomainPolicy } from '../src/core/scope/subdomainPolicy.js';
4
- import { ScopeManager } from '../src/core/scope/scopeManager.js';
5
-
6
- describe('DomainFilter', () => {
7
- it('should normalize hostnames', () => {
8
- const filter = new DomainFilter(['EXAMPLE.COM.'], ['DENY.COM.']);
9
- expect(filter.isAllowed('example.com')).toBe(true);
10
- expect(filter.isAllowed('deny.com')).toBe(false);
11
- });
12
-
13
- it('should respect precedence (deny wins)', () => {
14
- const filter = new DomainFilter(['example.com'], ['example.com']);
15
- expect(filter.isAllowed('example.com')).toBe(false);
16
- });
17
-
18
- it('should handle punycode', () => {
19
- // xn--80ak6aa92e.com is punycode for пример.com
20
- const filter = new DomainFilter(['xn--80ak6aa92e.com']);
21
- expect(filter.isAllowed('XN--80AK6AA92E.COM')).toBe(true);
22
- });
23
-
24
- it('should block if not in allow list (when list not empty)', () => {
25
- const filter = new DomainFilter(['allowed.com']);
26
- expect(filter.isAllowed('other.com')).toBe(false);
27
- });
28
-
29
- it('should fallback to raw string on invalid hostname', () => {
30
- // '[' and 'http://denied-invalid-[' causes new URL() to throw
31
- const filter = new DomainFilter(['['], ['denied-invalid-[']);
32
- expect(filter.isAllowed('[')).toBe(true);
33
- expect(filter.isAllowed('denied-invalid-[')).toBe(false);
34
- });
35
- });
36
-
37
- describe('SubdomainPolicy', () => {
38
- it('should enforce exact match by default', () => {
39
- const policy = new SubdomainPolicy('https://example.com');
40
- expect(policy.isAllowed('example.com')).toBe(true);
41
- expect(policy.isAllowed('sub.example.com')).toBe(false);
42
- });
43
-
44
- it('should allow valid subdomains when enabled', () => {
45
- const policy = new SubdomainPolicy('https://example.com', true);
46
- expect(policy.isAllowed('example.com')).toBe(true);
47
- expect(policy.isAllowed('sub.example.com')).toBe(true);
48
- expect(policy.isAllowed('deep.sub.example.com')).toBe(true);
49
- });
50
-
51
- it('should reject malicious suffix matches', () => {
52
- const policy = new SubdomainPolicy('https://example.com', true);
53
- expect(policy.isAllowed('evil-example.com')).toBe(false);
54
- expect(policy.isAllowed('example.com.evil.com')).toBe(false);
55
- });
56
- });
57
-
58
- describe('ScopeManager', () => {
59
- it('should compose policies correctly', () => {
60
- const manager = new ScopeManager({
61
- rootUrl: 'https://example.com',
62
- allowedDomains: ['example.com', 'sub.example.com', 'other.com'],
63
- deniedDomains: ['bad.example.com'],
64
- includeSubdomains: true
65
- });
66
-
67
- expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
68
- expect(manager.isUrlEligible('https://sub.example.com/')).toBe('allowed');
69
- expect(manager.isUrlEligible('https://bad.example.com/')).toBe('blocked_by_domain_filter');
70
- expect(manager.isUrlEligible('https://other.com/')).toBe('allowed');
71
- expect(manager.isUrlEligible('https://google.com/')).toBe('blocked_by_domain_filter');
72
- });
73
-
74
- it('should handle trailing dots in hostnames', () => {
75
- const manager = new ScopeManager({
76
- rootUrl: 'https://example.com',
77
- allowedDomains: ['example.com.'],
78
- includeSubdomains: false
79
- });
80
-
81
- expect(manager.isUrlEligible('https://example.com./')).toBe('allowed');
82
- expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
83
- });
84
- });
@@ -1,60 +0,0 @@
1
- import { expect, test } from 'vitest';
2
- import { scorePageSeo } from '../src/analysis/scoring.js';
3
- import { PageAnalysis } from '../src/analysis/analyze.js';
4
-
5
- const basePage: PageAnalysis = {
6
- url: 'https://example.com',
7
- status: 200,
8
- title: { value: 'x'.repeat(55), length: 55, status: 'ok' },
9
- metaDescription: { value: 'x'.repeat(150), length: 150, status: 'ok' },
10
- h1: { count: 1, status: 'ok', matchesTitle: false },
11
- content: { wordCount: 700, textHtmlRatio: 0.3, uniqueSentenceCount: 8 },
12
- thinScore: 0,
13
- images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
14
- links: { internalLinks: 5, externalLinks: 2, nofollowCount: 1, externalRatio: 2 / 7 },
15
- structuredData: { present: true, valid: true, types: ['Article'] },
16
- seoScore: 0,
17
- meta: { noindex: false, nofollow: false }
18
- };
19
-
20
- test('page score stays in 0-100', () => {
21
- expect(scorePageSeo(basePage)).toBeGreaterThanOrEqual(0);
22
- expect(scorePageSeo(basePage)).toBeLessThanOrEqual(100);
23
-
24
- const badPage: PageAnalysis = {
25
- ...basePage,
26
- title: { value: null, length: 0, status: 'missing' },
27
- metaDescription: { value: null, length: 0, status: 'missing' },
28
- h1: { count: 0, status: 'critical', matchesTitle: false },
29
- content: { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
30
- thinScore: 100,
31
- images: { totalImages: 2, missingAlt: 2, emptyAlt: 0 },
32
- structuredData: { present: false, valid: false, types: [] },
33
- links: { internalLinks: 0, externalLinks: 9, nofollowCount: 9, externalRatio: 1 }
34
- };
35
- expect(scorePageSeo(badPage)).toBeLessThan(50);
36
- });
37
-
38
- // test('aggregate site score includes existing metrics signals', () => {
39
- // const score = aggregateSiteScore({
40
- // totalPages: 2,
41
- // totalEdges: 1,
42
- // orphanPages: ['https://example.com/x'],
43
- // nearOrphans: [],
44
- // deepPages: [],
45
- // topAuthorityPages: [{ url: 'a', authority: 1 }],
46
- // averageOutDegree: 1,
47
- // maxDepthFound: 1,
48
- // crawlEfficiencyScore: 0.8,
49
- // averageDepth: 1,
50
- // structuralEntropy: 2,
51
- // limitReached: false
52
- // }, [
53
- // { ...basePage, seoScore: 70 },
54
- // { ...basePage, seoScore: 90, url: 'https://example.com/2' }
55
- // ]);
56
-
57
- // expect(score.seoHealthScore).toBe(80);
58
- // expect(score.overallScore).toBeGreaterThan(0);
59
- // expect(score.overallScore).toBeLessThanOrEqual(100);
60
- // });
@@ -1,100 +0,0 @@
1
- import { test, expect, beforeEach, vi } from 'vitest';
2
- import { Sitemap } from '../src/crawler/sitemap.js';
3
- import { MockAgent, setGlobalDispatcher } from 'undici';
4
- import { EngineContext } from '../src/events.js';
5
-
6
- let mockAgent: MockAgent;
7
-
8
- beforeEach(() => {
9
- mockAgent = new MockAgent();
10
- mockAgent.disableNetConnect();
11
- setGlobalDispatcher(mockAgent);
12
- });
13
-
14
- test('fetches and parses simple sitemap', async () => {
15
- const client = mockAgent.get('https://example.com');
16
- client.intercept({
17
- path: '/sitemap.xml',
18
- method: 'GET'
19
- }).reply(200, `
20
- <?xml version="1.0" encoding="UTF-8"?>
21
- <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
22
- <url>
23
- <loc>https://example.com/page1</loc>
24
- </url>
25
- <url>
26
- <loc>https://example.com/page2</loc>
27
- </url>
28
- </urlset>
29
- `);
30
-
31
- const sitemap = new Sitemap();
32
- const urls = await sitemap.fetch('https://example.com/sitemap.xml');
33
- expect(urls).toContain('https://example.com/page1');
34
- expect(urls).toContain('https://example.com/page2');
35
- expect(urls.length).toBe(2);
36
- });
37
-
38
- test('handles sitemap index recursively', async () => {
39
- const client = mockAgent.get('https://example.com');
40
-
41
- // Index
42
- client.intercept({
43
- path: '/sitemap-index.xml',
44
- method: 'GET'
45
- }).reply(200, `
46
- <?xml version="1.0" encoding="UTF-8"?>
47
- <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
48
- <sitemap>
49
- <loc>https://example.com/sitemap1.xml</loc>
50
- </sitemap>
51
- </sitemapindex>
52
- `);
53
-
54
- // Child sitemap
55
- client.intercept({
56
- path: '/sitemap1.xml',
57
- method: 'GET'
58
- }).reply(200, `
59
- <?xml version="1.0" encoding="UTF-8"?>
60
- <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
61
- <url>
62
- <loc>https://example.com/page3</loc>
63
- </url>
64
- </urlset>
65
- `);
66
-
67
- const sitemap = new Sitemap();
68
- const urls = await sitemap.fetch('https://example.com/sitemap-index.xml');
69
- expect(urls).toContain('https://example.com/page3');
70
- expect(urls.length).toBe(1);
71
- });
72
-
73
- test('handles invalid xml gracefully', async () => {
74
- const client = mockAgent.get('https://example.com');
75
- client.intercept({ path: '/bad.xml', method: 'GET' }).reply(200, 'Not XML');
76
-
77
- const sitemap = new Sitemap();
78
- const urls = await sitemap.fetch('https://example.com/bad.xml');
79
- expect(urls.length).toBe(0);
80
- });
81
-
82
- test('handles fetch errors gracefully', async () => {
83
- const client = mockAgent.get('https://example.com');
84
- client.intercept({ path: '/error.xml', method: 'GET' }).reply(500, 'Error');
85
-
86
- const sitemap = new Sitemap();
87
- const urls = await sitemap.fetch('https://example.com/error.xml');
88
- expect(urls.length).toBe(0);
89
- });
90
-
91
- test('emits warning on fetch error', async () => {
92
- const client = mockAgent.get('https://example.com');
93
- client.intercept({ path: '/error.xml', method: 'GET' }).replyWithError(new Error('Network error'));
94
-
95
- const mockContext: EngineContext = { emit: vi.fn() };
96
- const sitemap = new Sitemap(mockContext);
97
- await sitemap.fetch('https://example.com/error.xml');
98
-
99
- expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn' }));
100
- });
@@ -1,41 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Parser } from '../src/crawler/parser.js';
3
-
4
- describe('Soft 404 Detection', () => {
5
- const parser = new Parser();
6
- const baseUrl = 'https://example.com';
7
-
8
- it('should detect soft 404 by title pattern', () => {
9
- const html = '<html><head><title>Page Not Found</title></head><body>Welcome to the site</body></html>';
10
- const result = parser.parse(html, baseUrl, 200);
11
- expect(result.soft404Score).toBeGreaterThan(0.3);
12
- expect(result.soft404Signals).toContain('title_pattern_not_found');
13
- });
14
-
15
- it('should detect soft 404 by H1 pattern', () => {
16
- const html = '<html><body><h1>404 Error</h1></body></html>';
17
- const result = parser.parse(html, baseUrl, 200);
18
- expect(result.soft404Score).toBeGreaterThan(0.2);
19
- expect(result.soft404Signals).toContain('h1_pattern_404');
20
- });
21
-
22
- it('should detect soft 404 by very low word count', () => {
23
- const html = '<html><body>Short text</body></html>';
24
- const result = parser.parse(html, baseUrl, 200);
25
- expect(result.soft404Score).toBeGreaterThan(0.2);
26
- expect(result.soft404Signals).toContain('very_low_word_count');
27
- });
28
-
29
- it('should detect soft 404 by lack of outbound links', () => {
30
- const html = '<html><body>A page with some text but no links.</body></html>';
31
- const result = parser.parse(html, baseUrl, 200);
32
- expect(result.soft404Signals).toContain('no_outbound_links');
33
- });
34
-
35
- it('should combine multiple signals for high score', () => {
36
- const html = '<html><head><title>Error</title></head><body><h1>Not Found</h1><p>The requested page was not found.</p></body></html>';
37
- const result = parser.parse(html, baseUrl, 200);
38
- // title (0.4) + h1 (0.3) + body phrase (0.2) + low word count (0.3) = 1.2 -> capped at 1.0
39
- expect(result.soft404Score).toBe(1.0);
40
- });
41
- });
@@ -1,69 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { Fetcher } from '../src/crawler/fetcher.js';
3
- import { request } from 'undici';
4
- import { IPGuard } from '../src/core/security/ipGuard.js';
5
-
6
- // Mock undici request to fail with EBLOCKED
7
- vi.mock('undici', () => {
8
- return {
9
- request: vi.fn(),
10
- Agent: class {
11
- dispatch = vi.fn();
12
- },
13
- Dispatcher: class {}
14
- };
15
- });
16
-
17
- // Mock IPGuard.validateHost to pass
18
- vi.mock('../src/core/security/ipGuard.js', async () => {
19
- const original = await vi.importActual('../src/core/security/ipGuard.js');
20
- return {
21
- ...original as any,
22
- IPGuard: {
23
- ...original.IPGuard,
24
- validateHost: vi.fn().mockResolvedValue(true), // Pass step 1
25
- getSecureDispatcher: vi.fn()
26
- }
27
- };
28
- });
29
-
30
- describe('SSRF Fix Reproduction', () => {
31
- let fetcher: Fetcher;
32
-
33
- beforeEach(() => {
34
- vi.clearAllMocks();
35
- // Setup default mock return for dispatcher
36
- vi.mocked(IPGuard.getSecureDispatcher).mockReturnValue({} as any);
37
- fetcher = new Fetcher({ rate: 100 });
38
- });
39
-
40
- it('should return blocked_internal_ip when secureDispatcher blocks', async () => {
41
- const mockRequest = vi.mocked(request);
42
- const mockGetSecureDispatcher = vi.mocked(IPGuard.getSecureDispatcher);
43
- const mockDispatcher = { dispatch: vi.fn() } as any;
44
- mockGetSecureDispatcher.mockReturnValue(mockDispatcher);
45
-
46
- // Re-initialize fetcher so it calls getSecureDispatcher and gets our specific mock
47
- fetcher = new Fetcher({ rate: 100 });
48
-
49
- // Simulate secureDispatcher blocking via undici request throwing EBLOCKED
50
- const blockedError = new Error('Blocked internal IP: 127.0.0.1');
51
- (blockedError as any).code = 'EBLOCKED';
52
-
53
- mockRequest.mockRejectedValueOnce(blockedError);
54
-
55
- const res = await fetcher.fetch('http://example.com');
56
-
57
- // Now we expect correct handling
58
- expect(res.status).toBe('blocked_internal_ip');
59
-
60
- // Verify that the secure dispatcher was indeed used
61
- expect(mockGetSecureDispatcher).toHaveBeenCalled();
62
- expect(mockRequest).toHaveBeenCalledWith(
63
- expect.stringContaining('http://example.com'),
64
- expect.objectContaining({
65
- dispatcher: mockDispatcher
66
- })
67
- );
68
- });
69
- });
@@ -1,39 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { TrapDetector } from '../src/crawler/trap.js';
3
-
4
- describe('TrapDetector', () => {
5
- const detector = new TrapDetector();
6
-
7
- it('should detect session ID traps', () => {
8
- const result = detector.checkTrap('https://example.com/page?sid=12345', 1);
9
- expect(result.risk).toBeGreaterThan(0.8);
10
- expect(result.type).toBe('session_trap');
11
- });
12
-
13
- it('should detect calendar patterns', () => {
14
- const result = detector.checkTrap('https://example.com/archive/2023/12/01/', 1);
15
- expect(result.risk).toBeGreaterThan(0.6);
16
- expect(result.type).toBe('calendar_trap');
17
- });
18
-
19
- it('should detect pagination loops', () => {
20
- // Simulate many pages
21
- for (let i = 1; i <= 60; i++) {
22
- detector.checkTrap(`https://example.com/blog?page=${i}`, 1);
23
- }
24
- const result = detector.checkTrap('https://example.com/blog?page=61', 1);
25
- expect(result.risk).toBeGreaterThan(0.8);
26
- expect(result.type).toBe('pagination_loop');
27
- });
28
-
29
- it('should detect faceted navigation / parameter explosion', () => {
30
- detector.reset();
31
- const basePath = 'https://example.com/products';
32
- for (let i = 1; i <= 35; i++) {
33
- detector.checkTrap(`${basePath}?color=red&size=${i}`, 1);
34
- }
35
- const result = detector.checkTrap(`${basePath}?color=blue&size=large`, 1);
36
- expect(result.risk).toBeGreaterThan(0.9);
37
- expect(result.type).toBe('faceted_navigation');
38
- });
39
- });
@@ -1,46 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
- import { Crawl_HTML } from '../src/report/crawl_template.js';
3
- import { Graph } from '../src/graph/graph.js';
4
- import { computePageRank } from '../src/graph/pagerank.js';
5
-
6
- describe('Visualization Data & Template', () => {
7
- it('should include pageRankScore in graph JSON output after PageRank computation', () => {
8
- const graph = new Graph();
9
- graph.addNode('https://a.com', 0, 200);
10
- graph.addNode('https://b.com', 1, 200);
11
- graph.addEdge('https://a.com', 'https://b.com');
12
-
13
- computePageRank(graph);
14
-
15
- const json = graph.toJSON();
16
- const nodeA = json.nodes.find(n => n.url === 'https://a.com');
17
- const nodeB = json.nodes.find(n => n.url === 'https://b.com');
18
-
19
- expect(nodeA).toBeDefined();
20
- expect(nodeB).toBeDefined();
21
- expect(typeof nodeA?.pageRankScore).toBe('number');
22
- expect(typeof nodeB?.pageRankScore).toBe('number');
23
- });
24
-
25
- it('should contain UI toggle buttons for Authority Mode', () => {
26
- expect(Crawl_HTML).toContain('id="btn-auth-pagerank"');
27
- expect(Crawl_HTML).toContain('id="btn-auth-structural"');
28
- });
29
-
30
- it('should contain setAuthorityMode function', () => {
31
- // Use regex to be flexible with whitespace
32
- expect(Crawl_HTML).toMatch(/function\s+setAuthorityMode\s*\(mode,\s*btn\)/);
33
- expect(Crawl_HTML).toContain('n.authority = mode === \'pagerank\' ? n.pageRankAuthority : n.structuralAuthority');
34
- });
35
-
36
- it('should contain logic to calculate pageRankAuthority from pageRankScore', () => {
37
- expect(Crawl_HTML).toContain('n.pageRankAuthority = n.pageRankScore / 100');
38
- expect(Crawl_HTML).toContain('n.structuralAuthority = Math.log(1 + n.inLinks)');
39
- });
40
-
41
- it('should update details panel to show both metrics', () => {
42
- expect(Crawl_HTML).toContain('id="d-auth-container"');
43
- expect(Crawl_HTML).toContain('In-Degree: ${structVal}');
44
- expect(Crawl_HTML).toContain('PR: <strong>${prVal}</strong>');
45
- });
46
- });
package/tsconfig.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "extends": "../../tsconfig.json",
3
- "compilerOptions": {
4
- "outDir": "dist",
5
- "rootDir": "src",
6
- "declaration": true
7
- },
8
- "include": [
9
- "src"
10
- ]
11
- }