@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/tests/scope.test.ts
DELETED
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { DomainFilter } from '../src/core/scope/domainFilter.js';
|
|
3
|
-
import { SubdomainPolicy } from '../src/core/scope/subdomainPolicy.js';
|
|
4
|
-
import { ScopeManager } from '../src/core/scope/scopeManager.js';
|
|
5
|
-
|
|
6
|
-
describe('DomainFilter', () => {
|
|
7
|
-
it('should normalize hostnames', () => {
|
|
8
|
-
const filter = new DomainFilter(['EXAMPLE.COM.'], ['DENY.COM.']);
|
|
9
|
-
expect(filter.isAllowed('example.com')).toBe(true);
|
|
10
|
-
expect(filter.isAllowed('deny.com')).toBe(false);
|
|
11
|
-
});
|
|
12
|
-
|
|
13
|
-
it('should respect precedence (deny wins)', () => {
|
|
14
|
-
const filter = new DomainFilter(['example.com'], ['example.com']);
|
|
15
|
-
expect(filter.isAllowed('example.com')).toBe(false);
|
|
16
|
-
});
|
|
17
|
-
|
|
18
|
-
it('should handle punycode', () => {
|
|
19
|
-
// xn--80ak6aa92e.com is punycode for пример.com
|
|
20
|
-
const filter = new DomainFilter(['xn--80ak6aa92e.com']);
|
|
21
|
-
expect(filter.isAllowed('XN--80AK6AA92E.COM')).toBe(true);
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
it('should block if not in allow list (when list not empty)', () => {
|
|
25
|
-
const filter = new DomainFilter(['allowed.com']);
|
|
26
|
-
expect(filter.isAllowed('other.com')).toBe(false);
|
|
27
|
-
});
|
|
28
|
-
|
|
29
|
-
it('should fallback to raw string on invalid hostname', () => {
|
|
30
|
-
// '[' and 'http://denied-invalid-[' causes new URL() to throw
|
|
31
|
-
const filter = new DomainFilter(['['], ['denied-invalid-[']);
|
|
32
|
-
expect(filter.isAllowed('[')).toBe(true);
|
|
33
|
-
expect(filter.isAllowed('denied-invalid-[')).toBe(false);
|
|
34
|
-
});
|
|
35
|
-
});
|
|
36
|
-
|
|
37
|
-
describe('SubdomainPolicy', () => {
|
|
38
|
-
it('should enforce exact match by default', () => {
|
|
39
|
-
const policy = new SubdomainPolicy('https://example.com');
|
|
40
|
-
expect(policy.isAllowed('example.com')).toBe(true);
|
|
41
|
-
expect(policy.isAllowed('sub.example.com')).toBe(false);
|
|
42
|
-
});
|
|
43
|
-
|
|
44
|
-
it('should allow valid subdomains when enabled', () => {
|
|
45
|
-
const policy = new SubdomainPolicy('https://example.com', true);
|
|
46
|
-
expect(policy.isAllowed('example.com')).toBe(true);
|
|
47
|
-
expect(policy.isAllowed('sub.example.com')).toBe(true);
|
|
48
|
-
expect(policy.isAllowed('deep.sub.example.com')).toBe(true);
|
|
49
|
-
});
|
|
50
|
-
|
|
51
|
-
it('should reject malicious suffix matches', () => {
|
|
52
|
-
const policy = new SubdomainPolicy('https://example.com', true);
|
|
53
|
-
expect(policy.isAllowed('evil-example.com')).toBe(false);
|
|
54
|
-
expect(policy.isAllowed('example.com.evil.com')).toBe(false);
|
|
55
|
-
});
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
describe('ScopeManager', () => {
|
|
59
|
-
it('should compose policies correctly', () => {
|
|
60
|
-
const manager = new ScopeManager({
|
|
61
|
-
rootUrl: 'https://example.com',
|
|
62
|
-
allowedDomains: ['example.com', 'sub.example.com', 'other.com'],
|
|
63
|
-
deniedDomains: ['bad.example.com'],
|
|
64
|
-
includeSubdomains: true
|
|
65
|
-
});
|
|
66
|
-
|
|
67
|
-
expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
|
|
68
|
-
expect(manager.isUrlEligible('https://sub.example.com/')).toBe('allowed');
|
|
69
|
-
expect(manager.isUrlEligible('https://bad.example.com/')).toBe('blocked_by_domain_filter');
|
|
70
|
-
expect(manager.isUrlEligible('https://other.com/')).toBe('allowed');
|
|
71
|
-
expect(manager.isUrlEligible('https://google.com/')).toBe('blocked_by_domain_filter');
|
|
72
|
-
});
|
|
73
|
-
|
|
74
|
-
it('should handle trailing dots in hostnames', () => {
|
|
75
|
-
const manager = new ScopeManager({
|
|
76
|
-
rootUrl: 'https://example.com',
|
|
77
|
-
allowedDomains: ['example.com.'],
|
|
78
|
-
includeSubdomains: false
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
expect(manager.isUrlEligible('https://example.com./')).toBe('allowed');
|
|
82
|
-
expect(manager.isUrlEligible('https://example.com/')).toBe('allowed');
|
|
83
|
-
});
|
|
84
|
-
});
|
package/tests/scoring.test.ts
DELETED
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import { expect, test } from 'vitest';
|
|
2
|
-
import { scorePageSeo } from '../src/analysis/scoring.js';
|
|
3
|
-
import { PageAnalysis } from '../src/analysis/analyze.js';
|
|
4
|
-
|
|
5
|
-
const basePage: PageAnalysis = {
|
|
6
|
-
url: 'https://example.com',
|
|
7
|
-
status: 200,
|
|
8
|
-
title: { value: 'x'.repeat(55), length: 55, status: 'ok' },
|
|
9
|
-
metaDescription: { value: 'x'.repeat(150), length: 150, status: 'ok' },
|
|
10
|
-
h1: { count: 1, status: 'ok', matchesTitle: false },
|
|
11
|
-
content: { wordCount: 700, textHtmlRatio: 0.3, uniqueSentenceCount: 8 },
|
|
12
|
-
thinScore: 0,
|
|
13
|
-
images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
|
|
14
|
-
links: { internalLinks: 5, externalLinks: 2, nofollowCount: 1, externalRatio: 2 / 7 },
|
|
15
|
-
structuredData: { present: true, valid: true, types: ['Article'] },
|
|
16
|
-
seoScore: 0,
|
|
17
|
-
meta: { noindex: false, nofollow: false }
|
|
18
|
-
};
|
|
19
|
-
|
|
20
|
-
test('page score stays in 0-100', () => {
|
|
21
|
-
expect(scorePageSeo(basePage)).toBeGreaterThanOrEqual(0);
|
|
22
|
-
expect(scorePageSeo(basePage)).toBeLessThanOrEqual(100);
|
|
23
|
-
|
|
24
|
-
const badPage: PageAnalysis = {
|
|
25
|
-
...basePage,
|
|
26
|
-
title: { value: null, length: 0, status: 'missing' },
|
|
27
|
-
metaDescription: { value: null, length: 0, status: 'missing' },
|
|
28
|
-
h1: { count: 0, status: 'critical', matchesTitle: false },
|
|
29
|
-
content: { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
|
|
30
|
-
thinScore: 100,
|
|
31
|
-
images: { totalImages: 2, missingAlt: 2, emptyAlt: 0 },
|
|
32
|
-
structuredData: { present: false, valid: false, types: [] },
|
|
33
|
-
links: { internalLinks: 0, externalLinks: 9, nofollowCount: 9, externalRatio: 1 }
|
|
34
|
-
};
|
|
35
|
-
expect(scorePageSeo(badPage)).toBeLessThan(50);
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
// test('aggregate site score includes existing metrics signals', () => {
|
|
39
|
-
// const score = aggregateSiteScore({
|
|
40
|
-
// totalPages: 2,
|
|
41
|
-
// totalEdges: 1,
|
|
42
|
-
// orphanPages: ['https://example.com/x'],
|
|
43
|
-
// nearOrphans: [],
|
|
44
|
-
// deepPages: [],
|
|
45
|
-
// topAuthorityPages: [{ url: 'a', authority: 1 }],
|
|
46
|
-
// averageOutDegree: 1,
|
|
47
|
-
// maxDepthFound: 1,
|
|
48
|
-
// crawlEfficiencyScore: 0.8,
|
|
49
|
-
// averageDepth: 1,
|
|
50
|
-
// structuralEntropy: 2,
|
|
51
|
-
// limitReached: false
|
|
52
|
-
// }, [
|
|
53
|
-
// { ...basePage, seoScore: 70 },
|
|
54
|
-
// { ...basePage, seoScore: 90, url: 'https://example.com/2' }
|
|
55
|
-
// ]);
|
|
56
|
-
|
|
57
|
-
// expect(score.seoHealthScore).toBe(80);
|
|
58
|
-
// expect(score.overallScore).toBeGreaterThan(0);
|
|
59
|
-
// expect(score.overallScore).toBeLessThanOrEqual(100);
|
|
60
|
-
// });
|
package/tests/sitemap.test.ts
DELETED
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
import { test, expect, beforeEach, vi } from 'vitest';
|
|
2
|
-
import { Sitemap } from '../src/crawler/sitemap.js';
|
|
3
|
-
import { MockAgent, setGlobalDispatcher } from 'undici';
|
|
4
|
-
import { EngineContext } from '../src/events.js';
|
|
5
|
-
|
|
6
|
-
let mockAgent: MockAgent;
|
|
7
|
-
|
|
8
|
-
beforeEach(() => {
|
|
9
|
-
mockAgent = new MockAgent();
|
|
10
|
-
mockAgent.disableNetConnect();
|
|
11
|
-
setGlobalDispatcher(mockAgent);
|
|
12
|
-
});
|
|
13
|
-
|
|
14
|
-
test('fetches and parses simple sitemap', async () => {
|
|
15
|
-
const client = mockAgent.get('https://example.com');
|
|
16
|
-
client.intercept({
|
|
17
|
-
path: '/sitemap.xml',
|
|
18
|
-
method: 'GET'
|
|
19
|
-
}).reply(200, `
|
|
20
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
|
21
|
-
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
22
|
-
<url>
|
|
23
|
-
<loc>https://example.com/page1</loc>
|
|
24
|
-
</url>
|
|
25
|
-
<url>
|
|
26
|
-
<loc>https://example.com/page2</loc>
|
|
27
|
-
</url>
|
|
28
|
-
</urlset>
|
|
29
|
-
`);
|
|
30
|
-
|
|
31
|
-
const sitemap = new Sitemap();
|
|
32
|
-
const urls = await sitemap.fetch('https://example.com/sitemap.xml');
|
|
33
|
-
expect(urls).toContain('https://example.com/page1');
|
|
34
|
-
expect(urls).toContain('https://example.com/page2');
|
|
35
|
-
expect(urls.length).toBe(2);
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
test('handles sitemap index recursively', async () => {
|
|
39
|
-
const client = mockAgent.get('https://example.com');
|
|
40
|
-
|
|
41
|
-
// Index
|
|
42
|
-
client.intercept({
|
|
43
|
-
path: '/sitemap-index.xml',
|
|
44
|
-
method: 'GET'
|
|
45
|
-
}).reply(200, `
|
|
46
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
|
47
|
-
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
48
|
-
<sitemap>
|
|
49
|
-
<loc>https://example.com/sitemap1.xml</loc>
|
|
50
|
-
</sitemap>
|
|
51
|
-
</sitemapindex>
|
|
52
|
-
`);
|
|
53
|
-
|
|
54
|
-
// Child sitemap
|
|
55
|
-
client.intercept({
|
|
56
|
-
path: '/sitemap1.xml',
|
|
57
|
-
method: 'GET'
|
|
58
|
-
}).reply(200, `
|
|
59
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
|
60
|
-
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
61
|
-
<url>
|
|
62
|
-
<loc>https://example.com/page3</loc>
|
|
63
|
-
</url>
|
|
64
|
-
</urlset>
|
|
65
|
-
`);
|
|
66
|
-
|
|
67
|
-
const sitemap = new Sitemap();
|
|
68
|
-
const urls = await sitemap.fetch('https://example.com/sitemap-index.xml');
|
|
69
|
-
expect(urls).toContain('https://example.com/page3');
|
|
70
|
-
expect(urls.length).toBe(1);
|
|
71
|
-
});
|
|
72
|
-
|
|
73
|
-
test('handles invalid xml gracefully', async () => {
|
|
74
|
-
const client = mockAgent.get('https://example.com');
|
|
75
|
-
client.intercept({ path: '/bad.xml', method: 'GET' }).reply(200, 'Not XML');
|
|
76
|
-
|
|
77
|
-
const sitemap = new Sitemap();
|
|
78
|
-
const urls = await sitemap.fetch('https://example.com/bad.xml');
|
|
79
|
-
expect(urls.length).toBe(0);
|
|
80
|
-
});
|
|
81
|
-
|
|
82
|
-
test('handles fetch errors gracefully', async () => {
|
|
83
|
-
const client = mockAgent.get('https://example.com');
|
|
84
|
-
client.intercept({ path: '/error.xml', method: 'GET' }).reply(500, 'Error');
|
|
85
|
-
|
|
86
|
-
const sitemap = new Sitemap();
|
|
87
|
-
const urls = await sitemap.fetch('https://example.com/error.xml');
|
|
88
|
-
expect(urls.length).toBe(0);
|
|
89
|
-
});
|
|
90
|
-
|
|
91
|
-
test('emits warning on fetch error', async () => {
|
|
92
|
-
const client = mockAgent.get('https://example.com');
|
|
93
|
-
client.intercept({ path: '/error.xml', method: 'GET' }).replyWithError(new Error('Network error'));
|
|
94
|
-
|
|
95
|
-
const mockContext: EngineContext = { emit: vi.fn() };
|
|
96
|
-
const sitemap = new Sitemap(mockContext);
|
|
97
|
-
await sitemap.fetch('https://example.com/error.xml');
|
|
98
|
-
|
|
99
|
-
expect(mockContext.emit).toHaveBeenCalledWith(expect.objectContaining({ type: 'warn' }));
|
|
100
|
-
});
|
package/tests/soft404.test.ts
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { Parser } from '../src/crawler/parser.js';
|
|
3
|
-
|
|
4
|
-
describe('Soft 404 Detection', () => {
|
|
5
|
-
const parser = new Parser();
|
|
6
|
-
const baseUrl = 'https://example.com';
|
|
7
|
-
|
|
8
|
-
it('should detect soft 404 by title pattern', () => {
|
|
9
|
-
const html = '<html><head><title>Page Not Found</title></head><body>Welcome to the site</body></html>';
|
|
10
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
11
|
-
expect(result.soft404Score).toBeGreaterThan(0.3);
|
|
12
|
-
expect(result.soft404Signals).toContain('title_pattern_not_found');
|
|
13
|
-
});
|
|
14
|
-
|
|
15
|
-
it('should detect soft 404 by H1 pattern', () => {
|
|
16
|
-
const html = '<html><body><h1>404 Error</h1></body></html>';
|
|
17
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
18
|
-
expect(result.soft404Score).toBeGreaterThan(0.2);
|
|
19
|
-
expect(result.soft404Signals).toContain('h1_pattern_404');
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
it('should detect soft 404 by very low word count', () => {
|
|
23
|
-
const html = '<html><body>Short text</body></html>';
|
|
24
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
25
|
-
expect(result.soft404Score).toBeGreaterThan(0.2);
|
|
26
|
-
expect(result.soft404Signals).toContain('very_low_word_count');
|
|
27
|
-
});
|
|
28
|
-
|
|
29
|
-
it('should detect soft 404 by lack of outbound links', () => {
|
|
30
|
-
const html = '<html><body>A page with some text but no links.</body></html>';
|
|
31
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
32
|
-
expect(result.soft404Signals).toContain('no_outbound_links');
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
it('should combine multiple signals for high score', () => {
|
|
36
|
-
const html = '<html><head><title>Error</title></head><body><h1>Not Found</h1><p>The requested page was not found.</p></body></html>';
|
|
37
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
38
|
-
// title (0.4) + h1 (0.3) + body phrase (0.2) + low word count (0.3) = 1.2 -> capped at 1.0
|
|
39
|
-
expect(result.soft404Score).toBe(1.0);
|
|
40
|
-
});
|
|
41
|
-
});
|
package/tests/ssrf_fix.test.ts
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
-
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
|
-
import { request } from 'undici';
|
|
4
|
-
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
5
|
-
|
|
6
|
-
// Mock undici request to fail with EBLOCKED
|
|
7
|
-
vi.mock('undici', () => {
|
|
8
|
-
return {
|
|
9
|
-
request: vi.fn(),
|
|
10
|
-
Agent: class {
|
|
11
|
-
dispatch = vi.fn();
|
|
12
|
-
},
|
|
13
|
-
Dispatcher: class {}
|
|
14
|
-
};
|
|
15
|
-
});
|
|
16
|
-
|
|
17
|
-
// Mock IPGuard.validateHost to pass
|
|
18
|
-
vi.mock('../src/core/security/ipGuard.js', async () => {
|
|
19
|
-
const original = await vi.importActual('../src/core/security/ipGuard.js');
|
|
20
|
-
return {
|
|
21
|
-
...original as any,
|
|
22
|
-
IPGuard: {
|
|
23
|
-
...original.IPGuard,
|
|
24
|
-
validateHost: vi.fn().mockResolvedValue(true), // Pass step 1
|
|
25
|
-
getSecureDispatcher: vi.fn()
|
|
26
|
-
}
|
|
27
|
-
};
|
|
28
|
-
});
|
|
29
|
-
|
|
30
|
-
describe('SSRF Fix Reproduction', () => {
|
|
31
|
-
let fetcher: Fetcher;
|
|
32
|
-
|
|
33
|
-
beforeEach(() => {
|
|
34
|
-
vi.clearAllMocks();
|
|
35
|
-
// Setup default mock return for dispatcher
|
|
36
|
-
vi.mocked(IPGuard.getSecureDispatcher).mockReturnValue({} as any);
|
|
37
|
-
fetcher = new Fetcher({ rate: 100 });
|
|
38
|
-
});
|
|
39
|
-
|
|
40
|
-
it('should return blocked_internal_ip when secureDispatcher blocks', async () => {
|
|
41
|
-
const mockRequest = vi.mocked(request);
|
|
42
|
-
const mockGetSecureDispatcher = vi.mocked(IPGuard.getSecureDispatcher);
|
|
43
|
-
const mockDispatcher = { dispatch: vi.fn() } as any;
|
|
44
|
-
mockGetSecureDispatcher.mockReturnValue(mockDispatcher);
|
|
45
|
-
|
|
46
|
-
// Re-initialize fetcher so it calls getSecureDispatcher and gets our specific mock
|
|
47
|
-
fetcher = new Fetcher({ rate: 100 });
|
|
48
|
-
|
|
49
|
-
// Simulate secureDispatcher blocking via undici request throwing EBLOCKED
|
|
50
|
-
const blockedError = new Error('Blocked internal IP: 127.0.0.1');
|
|
51
|
-
(blockedError as any).code = 'EBLOCKED';
|
|
52
|
-
|
|
53
|
-
mockRequest.mockRejectedValueOnce(blockedError);
|
|
54
|
-
|
|
55
|
-
const res = await fetcher.fetch('http://example.com');
|
|
56
|
-
|
|
57
|
-
// Now we expect correct handling
|
|
58
|
-
expect(res.status).toBe('blocked_internal_ip');
|
|
59
|
-
|
|
60
|
-
// Verify that the secure dispatcher was indeed used
|
|
61
|
-
expect(mockGetSecureDispatcher).toHaveBeenCalled();
|
|
62
|
-
expect(mockRequest).toHaveBeenCalledWith(
|
|
63
|
-
expect.stringContaining('http://example.com'),
|
|
64
|
-
expect.objectContaining({
|
|
65
|
-
dispatcher: mockDispatcher
|
|
66
|
-
})
|
|
67
|
-
);
|
|
68
|
-
});
|
|
69
|
-
});
|
package/tests/trap.test.ts
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { TrapDetector } from '../src/crawler/trap.js';
|
|
3
|
-
|
|
4
|
-
describe('TrapDetector', () => {
|
|
5
|
-
const detector = new TrapDetector();
|
|
6
|
-
|
|
7
|
-
it('should detect session ID traps', () => {
|
|
8
|
-
const result = detector.checkTrap('https://example.com/page?sid=12345', 1);
|
|
9
|
-
expect(result.risk).toBeGreaterThan(0.8);
|
|
10
|
-
expect(result.type).toBe('session_trap');
|
|
11
|
-
});
|
|
12
|
-
|
|
13
|
-
it('should detect calendar patterns', () => {
|
|
14
|
-
const result = detector.checkTrap('https://example.com/archive/2023/12/01/', 1);
|
|
15
|
-
expect(result.risk).toBeGreaterThan(0.6);
|
|
16
|
-
expect(result.type).toBe('calendar_trap');
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
it('should detect pagination loops', () => {
|
|
20
|
-
// Simulate many pages
|
|
21
|
-
for (let i = 1; i <= 60; i++) {
|
|
22
|
-
detector.checkTrap(`https://example.com/blog?page=${i}`, 1);
|
|
23
|
-
}
|
|
24
|
-
const result = detector.checkTrap('https://example.com/blog?page=61', 1);
|
|
25
|
-
expect(result.risk).toBeGreaterThan(0.8);
|
|
26
|
-
expect(result.type).toBe('pagination_loop');
|
|
27
|
-
});
|
|
28
|
-
|
|
29
|
-
it('should detect faceted navigation / parameter explosion', () => {
|
|
30
|
-
detector.reset();
|
|
31
|
-
const basePath = 'https://example.com/products';
|
|
32
|
-
for (let i = 1; i <= 35; i++) {
|
|
33
|
-
detector.checkTrap(`${basePath}?color=red&size=${i}`, 1);
|
|
34
|
-
}
|
|
35
|
-
const result = detector.checkTrap(`${basePath}?color=blue&size=large`, 1);
|
|
36
|
-
expect(result.risk).toBeGreaterThan(0.9);
|
|
37
|
-
expect(result.type).toBe('faceted_navigation');
|
|
38
|
-
});
|
|
39
|
-
});
|
package/tests/visualization_data.test.ts
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { Crawl_HTML } from '../src/report/crawl_template.js';
|
|
3
|
-
import { Graph } from '../src/graph/graph.js';
|
|
4
|
-
import { computePageRank } from '../src/graph/pagerank.js';
|
|
5
|
-
|
|
6
|
-
describe('Visualization Data & Template', () => {
|
|
7
|
-
it('should include pageRankScore in graph JSON output after PageRank computation', () => {
|
|
8
|
-
const graph = new Graph();
|
|
9
|
-
graph.addNode('https://a.com', 0, 200);
|
|
10
|
-
graph.addNode('https://b.com', 1, 200);
|
|
11
|
-
graph.addEdge('https://a.com', 'https://b.com');
|
|
12
|
-
|
|
13
|
-
computePageRank(graph);
|
|
14
|
-
|
|
15
|
-
const json = graph.toJSON();
|
|
16
|
-
const nodeA = json.nodes.find(n => n.url === 'https://a.com');
|
|
17
|
-
const nodeB = json.nodes.find(n => n.url === 'https://b.com');
|
|
18
|
-
|
|
19
|
-
expect(nodeA).toBeDefined();
|
|
20
|
-
expect(nodeB).toBeDefined();
|
|
21
|
-
expect(typeof nodeA?.pageRankScore).toBe('number');
|
|
22
|
-
expect(typeof nodeB?.pageRankScore).toBe('number');
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it('should contain UI toggle buttons for Authority Mode', () => {
|
|
26
|
-
expect(Crawl_HTML).toContain('id="btn-auth-pagerank"');
|
|
27
|
-
expect(Crawl_HTML).toContain('id="btn-auth-structural"');
|
|
28
|
-
});
|
|
29
|
-
|
|
30
|
-
it('should contain setAuthorityMode function', () => {
|
|
31
|
-
// Use regex to be flexible with whitespace
|
|
32
|
-
expect(Crawl_HTML).toMatch(/function\s+setAuthorityMode\s*\(mode,\s*btn\)/);
|
|
33
|
-
expect(Crawl_HTML).toContain('n.authority = mode === \'pagerank\' ? n.pageRankAuthority : n.structuralAuthority');
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
it('should contain logic to calculate pageRankAuthority from pageRankScore', () => {
|
|
37
|
-
expect(Crawl_HTML).toContain('n.pageRankAuthority = n.pageRankScore / 100');
|
|
38
|
-
expect(Crawl_HTML).toContain('n.structuralAuthority = Math.log(1 + n.inLinks)');
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
it('should update details panel to show both metrics', () => {
|
|
42
|
-
expect(Crawl_HTML).toContain('id="d-auth-container"');
|
|
43
|
-
expect(Crawl_HTML).toContain('In-Degree: ${structVal}');
|
|
44
|
-
expect(Crawl_HTML).toContain('PR: <strong>${prVal}</strong>');
|
|
45
|
-
});
|
|
46
|
-
});
|