@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import { analyzeTitle, analyzeMetaDescription, applyDuplicateStatuses, analyzeH1 } from '../src/analysis/seo.js';
|
|
3
|
-
import { analyzeContent, calculateThinContentScore } from '../src/analysis/content.js';
|
|
4
|
-
import { analyzeStructuredData } from '../src/analysis/structuredData.js';
|
|
5
|
-
import { analyzeLinks } from '../src/analysis/links.js';
|
|
6
|
-
import { analyzeImageAlts } from '../src/analysis/images.js';
|
|
7
|
-
|
|
8
|
-
describe('SEO module', () => {
|
|
9
|
-
test('analyze title edge cases', () => {
|
|
10
|
-
expect(analyzeTitle('<html></html>').status).toBe('missing');
|
|
11
|
-
expect(analyzeTitle('<title>short</title>').status).toBe('too_short');
|
|
12
|
-
expect(analyzeTitle(`<title>${'a'.repeat(61)}</title>`).status).toBe('too_long');
|
|
13
|
-
expect(analyzeTitle(`<title>${'a'.repeat(55)}</title>`).status).toBe('ok');
|
|
14
|
-
});
|
|
15
|
-
|
|
16
|
-
test('duplicate detection', () => {
|
|
17
|
-
const values = applyDuplicateStatuses([
|
|
18
|
-
{ value: 'Same', length: 4, status: 'ok' as const },
|
|
19
|
-
{ value: 'same', length: 4, status: 'ok' as const },
|
|
20
|
-
{ value: null, length: 0, status: 'missing' as const }
|
|
21
|
-
]);
|
|
22
|
-
expect(values[0].status).toBe('duplicate');
|
|
23
|
-
expect(values[1].status).toBe('duplicate');
|
|
24
|
-
expect(values[2].status).toBe('missing');
|
|
25
|
-
});
|
|
26
|
-
|
|
27
|
-
test('meta description boundaries', () => {
|
|
28
|
-
expect(analyzeMetaDescription('<meta name="description" content="">').status).toBe('missing');
|
|
29
|
-
expect(analyzeMetaDescription('<html></html>').status).toBe('missing');
|
|
30
|
-
expect(analyzeMetaDescription('<meta name="description" content="short">').status).toBe('too_short');
|
|
31
|
-
expect(analyzeMetaDescription(`<meta name="description" content="${'x'.repeat(150)}">`).status).toBe('ok');
|
|
32
|
-
expect(analyzeMetaDescription(`<meta name="description" content="${'x'.repeat(170)}">`).status).toBe('too_long');
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
test('h1 variations', () => {
|
|
36
|
-
expect(analyzeH1('<h1>One</h1>', 'Title').status).toBe('ok');
|
|
37
|
-
expect(analyzeH1('<h1>One</h1><h1>Two</h1>', 'Title').status).toBe('warning');
|
|
38
|
-
const noH1 = analyzeH1('<p>none</p>', 'Title');
|
|
39
|
-
expect(noH1.status).toBe('critical');
|
|
40
|
-
const same = analyzeH1('<h1>same</h1>', 'Same');
|
|
41
|
-
expect(same.matchesTitle).toBe(true);
|
|
42
|
-
});
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
describe('content module', () => {
|
|
46
|
-
test('word count strips nav/footer/script/style', () => {
|
|
47
|
-
const html = '<body><nav>skip me</nav><p>keep words here</p><footer>skip</footer><script>var x</script><style>.x{}</style></body>';
|
|
48
|
-
const result = analyzeContent(html);
|
|
49
|
-
expect(result.wordCount).toBe(3);
|
|
50
|
-
expect(result.uniqueSentenceCount).toBe(1);
|
|
51
|
-
expect(result.textHtmlRatio).toBeGreaterThan(0);
|
|
52
|
-
});
|
|
53
|
-
|
|
54
|
-
test('thin score boundaries', () => {
|
|
55
|
-
expect(calculateThinContentScore({ wordCount: 600, textHtmlRatio: 0.5, uniqueSentenceCount: 4 }, 0)).toBe(0);
|
|
56
|
-
expect(calculateThinContentScore({ wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 1 }, 100)).toBe(100);
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
test('content handles malformed/empty html', () => {
|
|
60
|
-
expect(analyzeContent('').wordCount).toBe(0);
|
|
61
|
-
expect(analyzeContent('<div><span>broken').wordCount).toBeGreaterThanOrEqual(1);
|
|
62
|
-
});
|
|
63
|
-
});
|
|
64
|
-
|
|
65
|
-
describe('structured data', () => {
|
|
66
|
-
test('valid and invalid JSON-LD parsing', () => {
|
|
67
|
-
const valid = analyzeStructuredData('<script type="application/ld+json">{"@type":"Article"}</script>');
|
|
68
|
-
expect(valid.present).toBe(true);
|
|
69
|
-
expect(valid.valid).toBe(true);
|
|
70
|
-
expect(valid.types).toContain('Article');
|
|
71
|
-
|
|
72
|
-
const invalid = analyzeStructuredData('<script type="application/ld+json">{invalid}</script>');
|
|
73
|
-
expect(invalid.present).toBe(true);
|
|
74
|
-
expect(invalid.valid).toBe(false);
|
|
75
|
-
|
|
76
|
-
const missing = analyzeStructuredData('<p>none</p>');
|
|
77
|
-
expect(missing.present).toBe(false);
|
|
78
|
-
});
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
describe('links and images', () => {
|
|
82
|
-
test('link ratio calculation', () => {
|
|
83
|
-
const html = '<a href="/a">A</a><a href="https://other.com">B</a><a href="https://other.com" rel="nofollow">C</a>';
|
|
84
|
-
const links = analyzeLinks(html, 'https://example.com/page', 'https://example.com');
|
|
85
|
-
expect(links.internalLinks).toBe(1);
|
|
86
|
-
expect(links.externalLinks).toBe(2);
|
|
87
|
-
expect(links.nofollowCount).toBe(1);
|
|
88
|
-
expect(links.externalRatio).toBeCloseTo(2 / 3);
|
|
89
|
-
});
|
|
90
|
-
|
|
91
|
-
test('image alt detection', () => {
|
|
92
|
-
const html = '<img src="a"><img src="b" alt=""><img src="c" alt="ok">';
|
|
93
|
-
const imgs = analyzeImageAlts(html);
|
|
94
|
-
expect(imgs.totalImages).toBe(3);
|
|
95
|
-
expect(imgs.missingAlt).toBe(1);
|
|
96
|
-
expect(imgs.emptyAlt).toBe(1);
|
|
97
|
-
});
|
|
98
|
-
});
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import fs from 'node:fs/promises';
|
|
4
|
-
import { analyzeSite, renderAnalysisHtml } from '../src/analysis/analyze.js';
|
|
5
|
-
|
|
6
|
-
describe('analyze integration', () => {
|
|
7
|
-
const fixturePath = path.resolve(import.meta.dirname, 'fixtures/analyze-crawl.json');
|
|
8
|
-
|
|
9
|
-
test('analyzes full crawl fixture and schema', async () => {
|
|
10
|
-
const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
|
|
11
|
-
|
|
12
|
-
expect(result.site_summary.pages_analyzed).toBe(3);
|
|
13
|
-
expect(result.site_summary.duplicate_titles).toBe(2);
|
|
14
|
-
expect(result.site_summary.avg_seo_score).toBeGreaterThanOrEqual(0);
|
|
15
|
-
expect(result.pages[0]).toHaveProperty('title');
|
|
16
|
-
expect(result.pages[0]).toHaveProperty('content');
|
|
17
|
-
expect(result.pages[0]).toHaveProperty('links');
|
|
18
|
-
expect(result.site_scores.overallScore).toBeGreaterThanOrEqual(0);
|
|
19
|
-
expect(result.site_scores.overallScore).toBeLessThanOrEqual(100);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
test('module filter flags behavior', async () => {
|
|
23
|
-
const seoOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, seo: true });
|
|
24
|
-
expect(seoOnly.pages[0].content.wordCount).toBe(0);
|
|
25
|
-
expect(seoOnly.pages[0].images.totalImages).toBe(0);
|
|
26
|
-
|
|
27
|
-
const contentOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, content: true });
|
|
28
|
-
expect(contentOnly.pages[0].title.status).toBe('missing');
|
|
29
|
-
expect(contentOnly.pages[0].thinScore).toBeGreaterThanOrEqual(0);
|
|
30
|
-
|
|
31
|
-
const accessibilityOnly = await analyzeSite('https://example.com', { fromCrawl: fixturePath, accessibility: true });
|
|
32
|
-
expect(accessibilityOnly.pages[0].images.totalImages).toBeGreaterThan(0);
|
|
33
|
-
expect(accessibilityOnly.pages[0].title.status).toBe('missing');
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
test('html report generation', async () => {
|
|
37
|
-
const result = await analyzeSite('https://example.com', { fromCrawl: fixturePath });
|
|
38
|
-
const html = renderAnalysisHtml(result);
|
|
39
|
-
expect(html).toContain('<table');
|
|
40
|
-
expect(html).toContain('Analysis');
|
|
41
|
-
});
|
|
42
|
-
|
|
43
|
-
test('default database loading', async () => {
|
|
44
|
-
// Force in-memory DB for this test
|
|
45
|
-
process.env.CRAWLITH_DB_PATH = ':memory:';
|
|
46
|
-
|
|
47
|
-
// Close existing DB connection if any to ensure fresh start
|
|
48
|
-
const { getDb, closeDb } = await import('../src/db/index.js');
|
|
49
|
-
closeDb();
|
|
50
|
-
|
|
51
|
-
// Setup repositories
|
|
52
|
-
const { SiteRepository } = await import('../src/db/repositories/SiteRepository.js');
|
|
53
|
-
const { SnapshotRepository } = await import('../src/db/repositories/SnapshotRepository.js');
|
|
54
|
-
const { PageRepository } = await import('../src/db/repositories/PageRepository.js');
|
|
55
|
-
|
|
56
|
-
const db = getDb();
|
|
57
|
-
const siteRepo = new SiteRepository(db);
|
|
58
|
-
const snapshotRepo = new SnapshotRepository(db);
|
|
59
|
-
const pageRepo = new PageRepository(db);
|
|
60
|
-
|
|
61
|
-
// Create site and snapshot
|
|
62
|
-
const siteId = siteRepo.createSite('example.com');
|
|
63
|
-
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
|
|
64
|
-
|
|
65
|
-
// Parse fixture and load pages into db
|
|
66
|
-
const rawYaml = await fs.readFile(fixturePath, 'utf-8');
|
|
67
|
-
const rawData = JSON.parse(rawYaml);
|
|
68
|
-
(rawData.pages || rawData.nodes).forEach((p: any) => {
|
|
69
|
-
pageRepo.upsertPage({
|
|
70
|
-
site_id: siteId,
|
|
71
|
-
normalized_url: p.url,
|
|
72
|
-
last_seen_snapshot_id: snapshotId,
|
|
73
|
-
http_status: p.status || 200,
|
|
74
|
-
html: p.html || '',
|
|
75
|
-
depth: p.depth || 0,
|
|
76
|
-
});
|
|
77
|
-
});
|
|
78
|
-
|
|
79
|
-
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', { node_count: 3, edge_count: 0 });
|
|
80
|
-
|
|
81
|
-
try {
|
|
82
|
-
const result = await analyzeSite('https://example.com', {});
|
|
83
|
-
expect(result.site_summary.pages_analyzed).toBe(3);
|
|
84
|
-
} finally {
|
|
85
|
-
closeDb();
|
|
86
|
-
delete process.env.CRAWLITH_DB_PATH;
|
|
87
|
-
}
|
|
88
|
-
});
|
|
89
|
-
|
|
90
|
-
test('handles large html and js-only content', async () => {
|
|
91
|
-
const hugeText = '<html><body><script>document.write("x")</script>' + '<p>word </p>'.repeat(1000) + '</body></html>';
|
|
92
|
-
const tmpFile = path.resolve(import.meta.dirname, 'fixtures/large-analyze.json');
|
|
93
|
-
await fs.writeFile(tmpFile, JSON.stringify({ pages: [{ url: 'https://example.com/', status: 200, depth: 0, html: hugeText }] }));
|
|
94
|
-
const result = await analyzeSite('https://example.com', { fromCrawl: tmpFile });
|
|
95
|
-
expect(result.pages[0].content.wordCount).toBe(1000);
|
|
96
|
-
await fs.unlink(tmpFile);
|
|
97
|
-
});
|
|
98
|
-
});
|
package/tests/audit/dns.test.ts
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
-
import { resolveDns } from '../../src/audit/dns.js';
|
|
3
|
-
import dns from 'node:dns/promises';
|
|
4
|
-
|
|
5
|
-
vi.mock('node:dns/promises');
|
|
6
|
-
|
|
7
|
-
describe('DNS Diagnostics', () => {
|
|
8
|
-
it('should resolve all records', async () => {
|
|
9
|
-
vi.spyOn(dns, 'resolve4').mockResolvedValue(['1.1.1.1']);
|
|
10
|
-
vi.spyOn(dns, 'resolve6').mockResolvedValue(['2606::1']);
|
|
11
|
-
vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENODATA'));
|
|
12
|
-
vi.spyOn(dns, 'reverse').mockResolvedValue(['one.one.one.one']);
|
|
13
|
-
|
|
14
|
-
const result = await resolveDns('example.com');
|
|
15
|
-
expect(result.a).toEqual(['1.1.1.1']);
|
|
16
|
-
expect(result.aaaa).toEqual(['2606::1']);
|
|
17
|
-
expect(result.ipv6Support).toBe(true);
|
|
18
|
-
expect(result.reverse).toEqual(['one.one.one.one']);
|
|
19
|
-
expect(result.resolutionTime).toBeGreaterThanOrEqual(0);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
it('should handle failures gracefully', async () => {
|
|
23
|
-
vi.spyOn(dns, 'resolve4').mockRejectedValue(new Error('ENOTFOUND'));
|
|
24
|
-
vi.spyOn(dns, 'resolve6').mockRejectedValue(new Error('ENOTFOUND'));
|
|
25
|
-
vi.spyOn(dns, 'resolveCname').mockRejectedValue(new Error('ENOTFOUND'));
|
|
26
|
-
|
|
27
|
-
const result = await resolveDns('invalid.com');
|
|
28
|
-
expect(result.a).toEqual([]);
|
|
29
|
-
expect(result.ipCount).toBe(0);
|
|
30
|
-
});
|
|
31
|
-
});
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { analyzeHeaders } from '../../src/audit/headers.js';
|
|
3
|
-
|
|
4
|
-
describe('Headers Analysis', () => {
|
|
5
|
-
it('should detect all secure headers', () => {
|
|
6
|
-
const headers = {
|
|
7
|
-
'strict-transport-security': 'max-age=31536000; includeSubDomains',
|
|
8
|
-
'content-security-policy': "default-src 'self'",
|
|
9
|
-
'x-frame-options': 'DENY',
|
|
10
|
-
'x-content-type-options': 'nosniff',
|
|
11
|
-
'referrer-policy': 'strict-origin-when-cross-origin',
|
|
12
|
-
'permissions-policy': 'geolocation=()'
|
|
13
|
-
};
|
|
14
|
-
const result = analyzeHeaders(headers);
|
|
15
|
-
expect(result.score).toBe(100);
|
|
16
|
-
expect(result.strictTransportSecurity.valid).toBe(true);
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
it('should handle missing headers', () => {
|
|
20
|
-
const headers = {};
|
|
21
|
-
const result = analyzeHeaders(headers);
|
|
22
|
-
expect(result.score).toBe(0);
|
|
23
|
-
expect(result.strictTransportSecurity.present).toBe(false);
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
it('should validate HSTS properly', () => {
|
|
27
|
-
const headers = {
|
|
28
|
-
'strict-transport-security': 'max-age=0'
|
|
29
|
-
};
|
|
30
|
-
// valid requires max-age
|
|
31
|
-
const result = analyzeHeaders(headers);
|
|
32
|
-
expect(result.strictTransportSecurity.valid).toBe(true);
|
|
33
|
-
// Wait, checkHSTS: includes('max-age=') is true. includes('includeSubDomains') is false.
|
|
34
|
-
// Issues will contain 'Missing includeSubDomains'.
|
|
35
|
-
expect(result.strictTransportSecurity.issues).toContain('Missing includeSubDomains');
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
it('should validate invalid HSTS', () => {
|
|
39
|
-
const headers = {
|
|
40
|
-
'strict-transport-security': 'invalid'
|
|
41
|
-
};
|
|
42
|
-
const result = analyzeHeaders(headers);
|
|
43
|
-
expect(result.strictTransportSecurity.valid).toBe(false);
|
|
44
|
-
});
|
|
45
|
-
});
|
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { calculateScore } from '../../src/audit/scoring.js';
|
|
3
|
-
import { TransportDiagnostics, DnsDiagnostics, SecurityHeadersResult, PerformanceMetrics, AuditIssue } from '../../src/audit/types.js';
|
|
4
|
-
|
|
5
|
-
describe('Scoring Engine', () => {
|
|
6
|
-
const mockTransport: TransportDiagnostics = {
|
|
7
|
-
tlsVersion: 'TLSv1.3',
|
|
8
|
-
cipherSuite: 'TLS_AES_256_GCM_SHA384',
|
|
9
|
-
alpnProtocol: 'h2',
|
|
10
|
-
certificate: {
|
|
11
|
-
issuer: 'Let\'s Encrypt',
|
|
12
|
-
subject: 'example.com',
|
|
13
|
-
validFrom: '2023-01-01',
|
|
14
|
-
validTo: '2024-01-01',
|
|
15
|
-
daysUntilExpiry: 60,
|
|
16
|
-
isSelfSigned: false,
|
|
17
|
-
isValidChain: true,
|
|
18
|
-
fingerprint: 'SHA256:...'
|
|
19
|
-
} as any,
|
|
20
|
-
httpVersion: '2.0',
|
|
21
|
-
compression: ['gzip'],
|
|
22
|
-
keepAlive: true,
|
|
23
|
-
transferEncoding: null,
|
|
24
|
-
redirectCount: 0,
|
|
25
|
-
redirects: [],
|
|
26
|
-
serverHeader: 'nginx',
|
|
27
|
-
headers: {}
|
|
28
|
-
};
|
|
29
|
-
|
|
30
|
-
const mockDns: DnsDiagnostics = {
|
|
31
|
-
a: ['1.1.1.1', '1.0.0.1'],
|
|
32
|
-
aaaa: ['2606:4700:4700::1111'],
|
|
33
|
-
cname: [],
|
|
34
|
-
reverse: [],
|
|
35
|
-
ipCount: 3,
|
|
36
|
-
ipv6Support: true,
|
|
37
|
-
resolutionTime: 10
|
|
38
|
-
};
|
|
39
|
-
|
|
40
|
-
const mockHeaders: SecurityHeadersResult = {
|
|
41
|
-
strictTransportSecurity: { present: true, valid: true, value: 'max-age=31536000' },
|
|
42
|
-
contentSecurityPolicy: { present: true, valid: true, value: "default-src 'self'" },
|
|
43
|
-
xFrameOptions: { present: true, valid: true, value: 'DENY' },
|
|
44
|
-
xContentTypeOptions: { present: true, valid: true, value: 'nosniff' },
|
|
45
|
-
referrerPolicy: { present: true, valid: true, value: 'strict-origin' },
|
|
46
|
-
permissionsPolicy: { present: true, valid: true, value: 'geolocation=()' },
|
|
47
|
-
details: {},
|
|
48
|
-
score: 100
|
|
49
|
-
};
|
|
50
|
-
|
|
51
|
-
const mockPerformance: PerformanceMetrics = {
|
|
52
|
-
dnsLookupTime: 10,
|
|
53
|
-
tcpConnectTime: 20,
|
|
54
|
-
tlsHandshakeTime: 30,
|
|
55
|
-
ttfb: 100,
|
|
56
|
-
totalTime: 200,
|
|
57
|
-
htmlSize: 50000,
|
|
58
|
-
headerSize: 500,
|
|
59
|
-
redirectTime: 0
|
|
60
|
-
};
|
|
61
|
-
|
|
62
|
-
it('should give perfect score for perfect inputs', () => {
|
|
63
|
-
const result = calculateScore(mockTransport, mockDns, mockHeaders, mockPerformance, []);
|
|
64
|
-
expect(result.score).toBe(100);
|
|
65
|
-
expect(result.grade).toBe('A');
|
|
66
|
-
expect(result.issues).toHaveLength(0);
|
|
67
|
-
});
|
|
68
|
-
|
|
69
|
-
it('should penalize TLS < 1.2', () => {
|
|
70
|
-
const badTransport = { ...mockTransport, tlsVersion: 'TLSv1.1' };
|
|
71
|
-
const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
|
|
72
|
-
expect(result.score).toBeLessThan(100);
|
|
73
|
-
expect(result.categoryScores.transport).toBeLessThan(30);
|
|
74
|
-
expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'tls-old' })]));
|
|
75
|
-
});
|
|
76
|
-
|
|
77
|
-
it('should penalize missing HTTPS', () => {
|
|
78
|
-
const badTransport = { ...mockTransport, tlsVersion: null, certificate: null };
|
|
79
|
-
const result = calculateScore(badTransport, mockDns, mockHeaders, mockPerformance, []);
|
|
80
|
-
expect(result.score).toBeLessThan(50); // Critical
|
|
81
|
-
expect(result.grade).toBe('F');
|
|
82
|
-
expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'no-https' })]));
|
|
83
|
-
});
|
|
84
|
-
|
|
85
|
-
it('should fail on expired cert', () => {
|
|
86
|
-
const expiredTransport = {
|
|
87
|
-
...mockTransport,
|
|
88
|
-
certificate: { ...mockTransport.certificate!, daysUntilExpiry: -5, validTo: '2023-01-01' }
|
|
89
|
-
};
|
|
90
|
-
const result = calculateScore(expiredTransport, mockDns, mockHeaders, mockPerformance, []);
|
|
91
|
-
expect(result.grade).toBe('F');
|
|
92
|
-
expect(result.score).toBeLessThanOrEqual(40);
|
|
93
|
-
expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'cert-expired' })]));
|
|
94
|
-
});
|
|
95
|
-
|
|
96
|
-
it('should penalize missing security headers', () => {
|
|
97
|
-
// If score is 50, it means we lost 50 points in headers category (internal score)
|
|
98
|
-
// headers category is 20 points total. So we lose 10 points.
|
|
99
|
-
const badHeaders = { ...mockHeaders, score: 50, strictTransportSecurity: { present: false, valid: false, value: null } };
|
|
100
|
-
const result = calculateScore(mockTransport, mockDns, badHeaders, mockPerformance, []);
|
|
101
|
-
expect(result.categoryScores.security).toBe(10);
|
|
102
|
-
expect(result.score).toBe(90); // 100 - 10
|
|
103
|
-
expect(result.issues).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'hsts-missing' })]));
|
|
104
|
-
});
|
|
105
|
-
|
|
106
|
-
it('should penalize poor performance', () => {
|
|
107
|
-
const badPerf = { ...mockPerformance, ttfb: 1000, htmlSize: 2000000 };
|
|
108
|
-
const result = calculateScore(mockTransport, mockDns, mockHeaders, badPerf, []);
|
|
109
|
-
// TTFB > 800: Lose 10 pts
|
|
110
|
-
// HTML > 1MB: Lose 5 pts
|
|
111
|
-
// Total perf score (30) -> 15.
|
|
112
|
-
expect(result.categoryScores.performance).toBe(15);
|
|
113
|
-
expect(result.score).toBe(85);
|
|
114
|
-
expect(result.issues).toEqual(expect.arrayContaining([
|
|
115
|
-
expect.objectContaining({ id: 'slow-ttfb' }),
|
|
116
|
-
expect.objectContaining({ id: 'large-html' })
|
|
117
|
-
]));
|
|
118
|
-
});
|
|
119
|
-
|
|
120
|
-
it('should penalize infrastructure issues', () => {
|
|
121
|
-
const badDns = { ...mockDns, ipv6Support: false, ipCount: 1 };
|
|
122
|
-
const result = calculateScore(mockTransport, badDns, mockHeaders, mockPerformance, []);
|
|
123
|
-
// No IPv6: Lose 10 pts
|
|
124
|
-
// Single IP: Lose 10 pts
|
|
125
|
-
// Infra score (20) -> 0.
|
|
126
|
-
expect(result.categoryScores.infrastructure).toBe(0);
|
|
127
|
-
expect(result.score).toBe(80);
|
|
128
|
-
expect(result.issues).toEqual(expect.arrayContaining([
|
|
129
|
-
expect.objectContaining({ id: 'no-ipv6' }),
|
|
130
|
-
expect.objectContaining({ id: 'single-ip' })
|
|
131
|
-
]));
|
|
132
|
-
});
|
|
133
|
-
});
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { auditUrl } from '../../src/audit/index.js';
|
|
3
|
-
|
|
4
|
-
describe('Audit Security', () => {
|
|
5
|
-
it('should block audits of internal IP addresses', async () => {
|
|
6
|
-
await expect(auditUrl('http://127.0.0.1')).rejects.toThrow('Access to internal or private infrastructure is prohibited');
|
|
7
|
-
});
|
|
8
|
-
|
|
9
|
-
it('should block audits of link-local addresses', async () => {
|
|
10
|
-
await expect(auditUrl('http://169.254.169.254')).rejects.toThrow('Access to internal or private infrastructure is prohibited');
|
|
11
|
-
});
|
|
12
|
-
});
|
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, afterEach } from 'vitest';
|
|
2
|
-
import { analyzeTransport } from '../../src/audit/transport.js';
|
|
3
|
-
import https from 'node:https';
|
|
4
|
-
import http from 'node:http';
|
|
5
|
-
import tls from 'node:tls';
|
|
6
|
-
import { EventEmitter } from 'events';
|
|
7
|
-
|
|
8
|
-
vi.mock('node:https');
|
|
9
|
-
vi.mock('node:http');
|
|
10
|
-
|
|
11
|
-
describe('Transport Diagnostics', () => {
|
|
12
|
-
afterEach(() => {
|
|
13
|
-
vi.clearAllMocks();
|
|
14
|
-
});
|
|
15
|
-
|
|
16
|
-
it('should analyze HTTPS transport', async () => {
|
|
17
|
-
// Mock Response
|
|
18
|
-
const mockRes = new EventEmitter() as any;
|
|
19
|
-
mockRes.statusCode = 200;
|
|
20
|
-
mockRes.statusMessage = 'OK';
|
|
21
|
-
mockRes.headers = {
|
|
22
|
-
'content-encoding': 'gzip',
|
|
23
|
-
'server': 'nginx',
|
|
24
|
-
'connection': 'keep-alive'
|
|
25
|
-
};
|
|
26
|
-
mockRes.httpVersion = '1.1';
|
|
27
|
-
|
|
28
|
-
const mockSocket = new EventEmitter();
|
|
29
|
-
Object.setPrototypeOf(mockSocket, tls.TLSSocket.prototype);
|
|
30
|
-
(mockSocket as any).getPeerCertificate = () => ({
|
|
31
|
-
subject: { CN: 'example.com' },
|
|
32
|
-
issuer: { CN: 'Let\'s Encrypt' },
|
|
33
|
-
valid_from: 'Jan 1 2023',
|
|
34
|
-
valid_to: 'Jan 1 2024',
|
|
35
|
-
fingerprint: 'SHA256:...'
|
|
36
|
-
});
|
|
37
|
-
(mockSocket as any).getProtocol = () => 'TLSv1.3';
|
|
38
|
-
(mockSocket as any).getCipher = () => ({ name: 'TLS_AES_...' });
|
|
39
|
-
(mockSocket as any).alpnProtocol = 'h2';
|
|
40
|
-
(mockSocket as any).authorized = true;
|
|
41
|
-
|
|
42
|
-
mockRes.socket = mockSocket;
|
|
43
|
-
|
|
44
|
-
// Mock Request
|
|
45
|
-
const mockReq = new EventEmitter() as any;
|
|
46
|
-
mockReq.end = vi.fn();
|
|
47
|
-
mockReq.destroy = vi.fn();
|
|
48
|
-
|
|
49
|
-
// Mock https.request
|
|
50
|
-
vi.spyOn(https, 'request').mockImplementation((url, options, cb) => {
|
|
51
|
-
if (cb) cb(mockRes);
|
|
52
|
-
// Simulate socket events
|
|
53
|
-
setTimeout(() => {
|
|
54
|
-
mockReq.emit('socket', mockRes.socket);
|
|
55
|
-
mockRes.socket.emit('lookup');
|
|
56
|
-
mockRes.socket.emit('connect');
|
|
57
|
-
mockRes.socket.emit('secureConnect');
|
|
58
|
-
mockReq.emit('finish');
|
|
59
|
-
// Response data
|
|
60
|
-
mockRes.emit('data', Buffer.from('<html></html>'));
|
|
61
|
-
mockRes.emit('end');
|
|
62
|
-
}, 10);
|
|
63
|
-
return mockReq;
|
|
64
|
-
});
|
|
65
|
-
|
|
66
|
-
const result = await analyzeTransport('https://example.com', 1000);
|
|
67
|
-
expect(result.transport.tlsVersion).toBe('TLSv1.3');
|
|
68
|
-
expect(result.transport.httpVersion).toBe('1.1');
|
|
69
|
-
expect(result.performance.htmlSize).toBeGreaterThan(0);
|
|
70
|
-
expect(result.transport.headers['server']).toBe('nginx');
|
|
71
|
-
});
|
|
72
|
-
|
|
73
|
-
it('should handle redirects', async () => {
|
|
74
|
-
const req1 = new EventEmitter() as any; req1.end = vi.fn(); req1.destroy = vi.fn();
|
|
75
|
-
const res1 = new EventEmitter() as any; res1.statusCode = 301; res1.headers = { location: 'https://example.com/' };
|
|
76
|
-
res1.socket = new EventEmitter(); Object.setPrototypeOf(res1.socket, tls.TLSSocket.prototype);
|
|
77
|
-
|
|
78
|
-
const req2 = new EventEmitter() as any; req2.end = vi.fn(); req2.destroy = vi.fn();
|
|
79
|
-
const res2 = new EventEmitter() as any; res2.statusCode = 200; res2.headers = {};
|
|
80
|
-
res2.socket = new EventEmitter(); Object.setPrototypeOf(res2.socket, tls.TLSSocket.prototype);
|
|
81
|
-
|
|
82
|
-
// Setup res2 socket for TLS checks
|
|
83
|
-
res2.socket.getPeerCertificate = () => ({});
|
|
84
|
-
res2.socket.getProtocol = () => 'TLSv1.2';
|
|
85
|
-
res2.socket.getCipher = () => ({ name: 'AES' });
|
|
86
|
-
|
|
87
|
-
const requestSpy = vi.spyOn(https, 'request');
|
|
88
|
-
requestSpy
|
|
89
|
-
.mockImplementationOnce((url, options, cb) => {
|
|
90
|
-
if (cb) cb(res1);
|
|
91
|
-
setTimeout(() => {
|
|
92
|
-
req1.emit('socket', res1.socket);
|
|
93
|
-
res1.emit('data', Buffer.from('redirecting'));
|
|
94
|
-
res1.emit('end');
|
|
95
|
-
}, 10);
|
|
96
|
-
return req1;
|
|
97
|
-
})
|
|
98
|
-
.mockImplementationOnce((url, options, cb) => {
|
|
99
|
-
if (cb) cb(res2);
|
|
100
|
-
setTimeout(() => {
|
|
101
|
-
req2.emit('socket', res2.socket);
|
|
102
|
-
res2.emit('data', Buffer.from('ok'));
|
|
103
|
-
res2.emit('end');
|
|
104
|
-
}, 10);
|
|
105
|
-
return req2;
|
|
106
|
-
});
|
|
107
|
-
|
|
108
|
-
const result = await analyzeTransport('https://redirect.com', 1000);
|
|
109
|
-
expect(result.transport.redirectCount).toBe(1);
|
|
110
|
-
expect(result.transport.redirects[0].location).toBe('https://example.com/');
|
|
111
|
-
});
|
|
112
|
-
});
|
package/tests/clustering.test.ts
DELETED
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
-
import { Graph } from '../src/graph/graph.js';
|
|
3
|
-
import { detectContentClusters } from '../src/graph/cluster.js';
|
|
4
|
-
|
|
5
|
-
describe('Content Clustering', () => {
|
|
6
|
-
let graph: Graph;
|
|
7
|
-
|
|
8
|
-
beforeEach(() => {
|
|
9
|
-
graph = new Graph();
|
|
10
|
-
});
|
|
11
|
-
|
|
12
|
-
it('should group similar pages into a cluster', () => {
|
|
13
|
-
// Mock simhashes for similar pages (Hamming distance 1)
|
|
14
|
-
const h1 = 0b101010n;
|
|
15
|
-
const h2 = 0b101011n;
|
|
16
|
-
const h3 = 0b101001n;
|
|
17
|
-
|
|
18
|
-
graph.addNode('https://example.com/p1', 0, 200);
|
|
19
|
-
graph.addNode('https://example.com/p2', 0, 200);
|
|
20
|
-
graph.addNode('https://example.com/p3', 0, 200);
|
|
21
|
-
|
|
22
|
-
graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
|
|
23
|
-
graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
|
|
24
|
-
graph.updateNodeData('https://example.com/p3', { simhash: h3.toString() });
|
|
25
|
-
|
|
26
|
-
const clusters = detectContentClusters(graph, 2, 2);
|
|
27
|
-
|
|
28
|
-
expect(clusters.length).toBe(1);
|
|
29
|
-
expect(clusters[0].count).toBe(3);
|
|
30
|
-
expect(graph.nodes.get('https://example.com/p1')?.clusterId).toBe(1);
|
|
31
|
-
});
|
|
32
|
-
|
|
33
|
-
it('should separate dissimilar pages', () => {
|
|
34
|
-
// Mock simhashes for very different pages
|
|
35
|
-
const h1 = 0b1111111111n;
|
|
36
|
-
const h2 = 0b0000000000n;
|
|
37
|
-
|
|
38
|
-
graph.addNode('https://example.com/p1', 0, 200);
|
|
39
|
-
graph.addNode('https://example.com/p2', 0, 200);
|
|
40
|
-
|
|
41
|
-
graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
|
|
42
|
-
graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
|
|
43
|
-
|
|
44
|
-
const clusters = detectContentClusters(graph, 2, 2);
|
|
45
|
-
|
|
46
|
-
expect(clusters.length).toBe(0); // None meet minSize 2
|
|
47
|
-
});
|
|
48
|
-
|
|
49
|
-
it('should respect minClusterSize', () => {
|
|
50
|
-
const h1 = 0b1n;
|
|
51
|
-
const h2 = 0b0n;
|
|
52
|
-
|
|
53
|
-
graph.addNode('https://example.com/p1', 0, 200);
|
|
54
|
-
graph.addNode('https://example.com/p2', 0, 200);
|
|
55
|
-
|
|
56
|
-
graph.updateNodeData('https://example.com/p1', { simhash: h1.toString() });
|
|
57
|
-
graph.updateNodeData('https://example.com/p2', { simhash: h2.toString() });
|
|
58
|
-
|
|
59
|
-
const clusters = detectContentClusters(graph, 1, 3);
|
|
60
|
-
expect(clusters.length).toBe(0);
|
|
61
|
-
});
|
|
62
|
-
|
|
63
|
-
it('should identify shared path prefixes (silos)', () => {
|
|
64
|
-
graph.addNode('https://example.com/blog/seo-tips', 0, 200);
|
|
65
|
-
graph.addNode('https://example.com/blog/link-building', 0, 200);
|
|
66
|
-
graph.addNode('https://example.com/blog/technical-seo', 0, 200);
|
|
67
|
-
|
|
68
|
-
const h = 0b111n;
|
|
69
|
-
graph.updateNodeData('https://example.com/blog/seo-tips', { simhash: h.toString() });
|
|
70
|
-
graph.updateNodeData('https://example.com/blog/link-building', { simhash: h.toString() });
|
|
71
|
-
graph.updateNodeData('https://example.com/blog/technical-seo', { simhash: h.toString() });
|
|
72
|
-
|
|
73
|
-
const clusters = detectContentClusters(graph, 0, 3);
|
|
74
|
-
expect(clusters[0].sharedPathPrefix).toBe('/blog');
|
|
75
|
-
});
|
|
76
|
-
|
|
77
|
-
it('should be deterministic with unstable input order', () => {
|
|
78
|
-
// We'll add nodes in different orders and check if cluster primary is same
|
|
79
|
-
const h = 0b111n;
|
|
80
|
-
graph.addNode('https://example.com/z', 0, 200);
|
|
81
|
-
graph.addNode('https://example.com/a', 0, 200);
|
|
82
|
-
graph.addNode('https://example.com/m', 0, 200);
|
|
83
|
-
|
|
84
|
-
graph.updateNodeData('https://example.com/z', { simhash: h.toString(), pageRank: 10 });
|
|
85
|
-
graph.updateNodeData('https://example.com/a', { simhash: h.toString(), pageRank: 10 });
|
|
86
|
-
graph.updateNodeData('https://example.com/m', { simhash: h.toString(), pageRank: 10 });
|
|
87
|
-
|
|
88
|
-
const clusters = detectContentClusters(graph, 0, 3);
|
|
89
|
-
// a should be primary because it's shortest/lexicographic first since PageRanks are same
|
|
90
|
-
expect(clusters[0].primaryUrl).toBe('https://example.com/a');
|
|
91
|
-
});
|
|
92
|
-
|
|
93
|
-
it('should use band optimization correctly (heuristic nature)', () => {
|
|
94
|
-
// Create many nodes in 2 groups
|
|
95
|
-
// Group 1: Matches in band 0
|
|
96
|
-
// Group 2: Matches in band 1
|
|
97
|
-
for (let i = 0; i < 5; i++) {
|
|
98
|
-
const url = `https://example.com/g1/${i}`;
|
|
99
|
-
graph.addNode(url, 0, 200);
|
|
100
|
-
// Simhash that matches in first 16 bits (0xAAAA)
|
|
101
|
-
const hash = BigInt(0xAAAA) | (BigInt(i) << 16n);
|
|
102
|
-
graph.updateNodeData(url, { simhash: hash.toString() });
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
for (let i = 0; i < 5; i++) {
|
|
106
|
-
const url = `https://example.com/g2/${i}`;
|
|
107
|
-
graph.addNode(url, 0, 200);
|
|
108
|
-
// Simhash that matches in second 16 bits (0xBBBB << 16)
|
|
109
|
-
const hash = (BigInt(0xBBBB) << 16n) | BigInt(i);
|
|
110
|
-
graph.updateNodeData(url, { simhash: hash.toString() });
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
const clusters = detectContentClusters(graph, 5, 3);
|
|
114
|
-
expect(clusters.length).toBe(2);
|
|
115
|
-
expect(clusters[0].count).toBe(5);
|
|
116
|
-
expect(clusters[1].count).toBe(5);
|
|
117
|
-
});
|
|
118
|
-
});
|