@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
-
import { Fetcher } from '../src/crawler/fetcher.js';
|
|
3
|
-
import { request } from 'undici';
|
|
4
|
-
|
|
5
|
-
vi.mock('undici', () => {
|
|
6
|
-
return {
|
|
7
|
-
request: vi.fn(),
|
|
8
|
-
Agent: class {
|
|
9
|
-
dispatch = vi.fn();
|
|
10
|
-
},
|
|
11
|
-
Dispatcher: class {}
|
|
12
|
-
};
|
|
13
|
-
});
|
|
14
|
-
|
|
15
|
-
describe('Fetcher Safety Integration', () => {
|
|
16
|
-
let fetcher: Fetcher;
|
|
17
|
-
|
|
18
|
-
beforeEach(() => {
|
|
19
|
-
vi.clearAllMocks();
|
|
20
|
-
fetcher = new Fetcher({ rate: 100 }); // High rate for tests
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
it('should block internal IPs', async () => {
|
|
24
|
-
const res = await fetcher.fetch('http://127.0.0.1');
|
|
25
|
-
expect(res.status).toBe('blocked_internal_ip');
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
it('should block internal IPs in redirects', async () => {
|
|
29
|
-
const mockRequest = vi.mocked(request);
|
|
30
|
-
|
|
31
|
-
// First request is fine, returns redirect
|
|
32
|
-
mockRequest.mockResolvedValueOnce({
|
|
33
|
-
statusCode: 301,
|
|
34
|
-
headers: { location: 'http://192.168.1.1' },
|
|
35
|
-
body: { dump: vi.fn(), text: vi.fn().mockResolvedValue('') }
|
|
36
|
-
} as any);
|
|
37
|
-
|
|
38
|
-
const res = await fetcher.fetch('http://example.com');
|
|
39
|
-
expect(res.status).toBe('blocked_internal_ip');
|
|
40
|
-
expect(res.redirectChain).toHaveLength(1); // Records the redirect that led to block
|
|
41
|
-
expect(res.redirectChain[0].target).toBe('http://192.168.1.1/');
|
|
42
|
-
});
|
|
43
|
-
|
|
44
|
-
it('should enforce max bytes', async () => {
|
|
45
|
-
const mockRequest = vi.mocked(request);
|
|
46
|
-
|
|
47
|
-
mockRequest.mockResolvedValueOnce({
|
|
48
|
-
statusCode: 200,
|
|
49
|
-
headers: {},
|
|
50
|
-
body: {
|
|
51
|
-
on: vi.fn((event, cb) => {
|
|
52
|
-
if (event === 'data') {
|
|
53
|
-
cb(Buffer.alloc(1000));
|
|
54
|
-
cb(Buffer.alloc(1000));
|
|
55
|
-
}
|
|
56
|
-
return { on: vi.fn() };
|
|
57
|
-
}),
|
|
58
|
-
destroy: vi.fn(),
|
|
59
|
-
dump: vi.fn()
|
|
60
|
-
}
|
|
61
|
-
} as any);
|
|
62
|
-
|
|
63
|
-
const res = await fetcher.fetch('http://example.com', { maxBytes: 500 });
|
|
64
|
-
expect(res.status).toBe('oversized');
|
|
65
|
-
});
|
|
66
|
-
|
|
67
|
-
it('should retry on 500', async () => {
|
|
68
|
-
const mockRequest = vi.mocked(request);
|
|
69
|
-
|
|
70
|
-
mockRequest
|
|
71
|
-
.mockResolvedValueOnce({
|
|
72
|
-
statusCode: 500,
|
|
73
|
-
headers: {},
|
|
74
|
-
body: { dump: vi.fn().mockResolvedValue(undefined) }
|
|
75
|
-
} as any)
|
|
76
|
-
.mockResolvedValueOnce({
|
|
77
|
-
statusCode: 200,
|
|
78
|
-
headers: {},
|
|
79
|
-
body: {
|
|
80
|
-
on: vi.fn((event, cb) => {
|
|
81
|
-
if (event === 'data') cb(Buffer.from('ok'));
|
|
82
|
-
if (event === 'end') cb();
|
|
83
|
-
})
|
|
84
|
-
}
|
|
85
|
-
} as any);
|
|
86
|
-
|
|
87
|
-
const res = await fetcher.fetch('http://example.com');
|
|
88
|
-
expect(res.status).toBe(200);
|
|
89
|
-
expect(res.retries).toBe(1);
|
|
90
|
-
});
|
|
91
|
-
});
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"pages": [
|
|
3
|
-
{
|
|
4
|
-
"url": "https://example.com/",
|
|
5
|
-
"status": 200,
|
|
6
|
-
"depth": 0,
|
|
7
|
-
"html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='This is an intentionally long and descriptive meta description designed to fit ideal search snippet lengths with rich context for users and engines.'/></head><body><h1>Home</h1><nav><a href='/skip'>Nav</a></nav><p>Welcome to the homepage. This page contains meaningful content. Another sentence here.</p><img src='/a.jpg' alt='hero'><img src='/b.jpg'><a href='/about'>About</a><a href='https://external.com' rel='nofollow noopener'>External</a><script type='application/ld+json'>{\"@context\":\"https://schema.org\",\"@type\":\"WebSite\"}</script></body></html>"
|
|
8
|
-
},
|
|
9
|
-
{
|
|
10
|
-
"url": "https://example.com/about",
|
|
11
|
-
"status": 200,
|
|
12
|
-
"depth": 1,
|
|
13
|
-
"html": "<html><head><title>Example Home Page SEO Title For Strong Ranking Signals 12345</title><meta name='description' content='short desc'/></head><body><h1>Example Home Page SEO Title For Strong Ranking Signals 12345</h1><h1>Second</h1><p>Duplicate body sentence. Duplicate body sentence.</p><img src='/c.jpg' alt=''><script type='application/ld+json'>not-json</script><a href='https://example.com/'>Home</a></body></html>"
|
|
14
|
-
},
|
|
15
|
-
{
|
|
16
|
-
"url": "https://example.com/empty",
|
|
17
|
-
"status": 200,
|
|
18
|
-
"depth": 2,
|
|
19
|
-
"html": ""
|
|
20
|
-
}
|
|
21
|
-
],
|
|
22
|
-
"edges": [
|
|
23
|
-
{ "source": "https://example.com/", "target": "https://example.com/about" },
|
|
24
|
-
{ "source": "https://example.com/about", "target": "https://example.com/" }
|
|
25
|
-
]
|
|
26
|
-
}
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach } from 'vitest';
|
|
2
|
-
import { Graph } from '../../src/graph/graph.js';
|
|
3
|
-
|
|
4
|
-
describe('Graph', () => {
|
|
5
|
-
let graph: Graph;
|
|
6
|
-
|
|
7
|
-
beforeEach(() => {
|
|
8
|
-
graph = new Graph();
|
|
9
|
-
});
|
|
10
|
-
|
|
11
|
-
it('should add a new node', () => {
|
|
12
|
-
graph.addNode('http://example.com', 0, 200);
|
|
13
|
-
const node = graph.nodes.get('http://example.com');
|
|
14
|
-
expect(node).toBeDefined();
|
|
15
|
-
expect(node?.depth).toBe(0);
|
|
16
|
-
expect(node?.status).toBe(200);
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
it('should update existing node status if non-zero', () => {
|
|
20
|
-
graph.addNode('http://example.com', 0, 0);
|
|
21
|
-
graph.addNode('http://example.com', 1, 200); // Should update status, but not depth?
|
|
22
|
-
// Wait, addNode implementation:
|
|
23
|
-
// if (!existing) { ... } else { if (status !== 0) existing.status = status; }
|
|
24
|
-
|
|
25
|
-
const node = graph.nodes.get('http://example.com');
|
|
26
|
-
expect(node?.status).toBe(200);
|
|
27
|
-
expect(node?.depth).toBe(0); // Depth should not change
|
|
28
|
-
});
|
|
29
|
-
|
|
30
|
-
it('should add an edge', () => {
|
|
31
|
-
graph.addNode('http://a.com', 0);
|
|
32
|
-
graph.addNode('http://b.com', 1);
|
|
33
|
-
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
34
|
-
|
|
35
|
-
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
36
|
-
expect(graph.edges.has(edgeKey)).toBe(true);
|
|
37
|
-
expect(graph.edges.get(edgeKey)).toBe(0.5);
|
|
38
|
-
|
|
39
|
-
const source = graph.nodes.get('http://a.com');
|
|
40
|
-
const target = graph.nodes.get('http://b.com');
|
|
41
|
-
expect(source?.outLinks).toBe(1);
|
|
42
|
-
expect(target?.inLinks).toBe(1);
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
it('should update edge weight if new weight is higher', () => {
|
|
46
|
-
graph.addNode('http://a.com', 0);
|
|
47
|
-
graph.addNode('http://b.com', 1);
|
|
48
|
-
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
49
|
-
graph.addEdge('http://a.com', 'http://b.com', 0.8);
|
|
50
|
-
|
|
51
|
-
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
52
|
-
expect(graph.edges.get(edgeKey)).toBe(0.8);
|
|
53
|
-
|
|
54
|
-
// Should not increment link counts again
|
|
55
|
-
const source = graph.nodes.get('http://a.com');
|
|
56
|
-
expect(source?.outLinks).toBe(1);
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
it('should not update edge weight if new weight is lower', () => {
|
|
60
|
-
graph.addNode('http://a.com', 0);
|
|
61
|
-
graph.addNode('http://b.com', 1);
|
|
62
|
-
graph.addEdge('http://a.com', 'http://b.com', 0.8);
|
|
63
|
-
graph.addEdge('http://a.com', 'http://b.com', 0.5);
|
|
64
|
-
|
|
65
|
-
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
66
|
-
expect(graph.edges.get(edgeKey)).toBe(0.8);
|
|
67
|
-
});
|
|
68
|
-
|
|
69
|
-
it('should serialize to JSON and deserialize from JSON', () => {
|
|
70
|
-
graph.addNode('http://a.com', 0, 200);
|
|
71
|
-
graph.addNode('http://b.com', 1, 200);
|
|
72
|
-
graph.addEdge('http://a.com', 'http://b.com', 1.0);
|
|
73
|
-
graph.duplicateClusters = [{ id: '1', type: 'exact', size: 2, representative: 'http://a.com', severity: 'high' }];
|
|
74
|
-
graph.contentClusters = [{ id: 1, count: 2, primaryUrl: 'http://a.com', risk: 'high' }];
|
|
75
|
-
|
|
76
|
-
const json = graph.toJSON();
|
|
77
|
-
const newGraph = Graph.fromJSON(json);
|
|
78
|
-
|
|
79
|
-
expect(newGraph.nodes.size).toBe(2);
|
|
80
|
-
expect(newGraph.edges.size).toBe(1);
|
|
81
|
-
expect(newGraph.duplicateClusters).toHaveLength(1);
|
|
82
|
-
expect(newGraph.contentClusters).toHaveLength(1);
|
|
83
|
-
|
|
84
|
-
const nodeA = newGraph.nodes.get('http://a.com');
|
|
85
|
-
expect(nodeA?.status).toBe(200);
|
|
86
|
-
|
|
87
|
-
const edgeKey = Graph.getEdgeKey('http://a.com', 'http://b.com');
|
|
88
|
-
expect(newGraph.edges.get(edgeKey)).toBe(1.0);
|
|
89
|
-
});
|
|
90
|
-
|
|
91
|
-
it('should handle partial JSON in fromJSON', () => {
|
|
92
|
-
const json = {
|
|
93
|
-
nodes: [{ url: 'http://a.com', depth: 0, status: 200, inLinks: 0, outLinks: 0 }],
|
|
94
|
-
// missing edges, clusters
|
|
95
|
-
};
|
|
96
|
-
const newGraph = Graph.fromJSON(json);
|
|
97
|
-
expect(newGraph.nodes.size).toBe(1);
|
|
98
|
-
expect(newGraph.edges.size).toBe(0);
|
|
99
|
-
});
|
|
100
|
-
});
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
-
import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
|
|
3
|
-
import { getDb, closeDb } from '../src/db/index.js';
|
|
4
|
-
import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
|
|
5
|
-
import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
|
|
6
|
-
import { PageRepository } from '../src/db/repositories/PageRepository.js';
|
|
7
|
-
import { MetricsRepository } from '../src/db/repositories/MetricsRepository.js';
|
|
8
|
-
import { Database } from 'better-sqlite3';
|
|
9
|
-
|
|
10
|
-
describe('GraphLoader', () => {
|
|
11
|
-
let db: Database;
|
|
12
|
-
|
|
13
|
-
beforeEach(() => {
|
|
14
|
-
process.env.NODE_ENV = 'test';
|
|
15
|
-
closeDb();
|
|
16
|
-
db = getDb();
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
afterEach(() => {
|
|
20
|
-
closeDb();
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
it('should load graph with metrics correctly', () => {
|
|
24
|
-
const siteRepo = new SiteRepository(db);
|
|
25
|
-
const snapshotRepo = new SnapshotRepository(db);
|
|
26
|
-
const pageRepo = new PageRepository(db);
|
|
27
|
-
const metricsRepo = new MetricsRepository(db);
|
|
28
|
-
|
|
29
|
-
const siteId = siteRepo.createSite('example.com');
|
|
30
|
-
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
|
|
31
|
-
const url = 'http://example.com/page1';
|
|
32
|
-
|
|
33
|
-
// Create Page
|
|
34
|
-
pageRepo.upsertPage({
|
|
35
|
-
site_id: siteId,
|
|
36
|
-
normalized_url: url,
|
|
37
|
-
last_seen_snapshot_id: snapshotId,
|
|
38
|
-
http_status: 200,
|
|
39
|
-
depth: 0
|
|
40
|
-
});
|
|
41
|
-
const page = pageRepo.getPage(siteId, url)!;
|
|
42
|
-
|
|
43
|
-
// Insert Metrics
|
|
44
|
-
metricsRepo.insertMetrics({
|
|
45
|
-
snapshot_id: snapshotId,
|
|
46
|
-
page_id: page.id,
|
|
47
|
-
authority_score: 0.5,
|
|
48
|
-
hub_score: 0.2,
|
|
49
|
-
pagerank: 0.8,
|
|
50
|
-
pagerank_score: 80.0,
|
|
51
|
-
link_role: 'authority',
|
|
52
|
-
crawl_status: 'fetched',
|
|
53
|
-
word_count: 500,
|
|
54
|
-
thin_content_score: 10,
|
|
55
|
-
external_link_ratio: 0.1,
|
|
56
|
-
orphan_score: 5,
|
|
57
|
-
duplicate_cluster_id: null,
|
|
58
|
-
duplicate_type: null,
|
|
59
|
-
is_cluster_primary: 1
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
// Load Graph
|
|
63
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
64
|
-
const node = graph.nodes.get(url);
|
|
65
|
-
|
|
66
|
-
expect(node).toBeDefined();
|
|
67
|
-
expect(node?.authorityScore).toBe(0.5);
|
|
68
|
-
expect(node?.hubScore).toBe(0.2);
|
|
69
|
-
// Verify new fields
|
|
70
|
-
expect(node?.crawlStatus).toBe('fetched');
|
|
71
|
-
expect(node?.wordCount).toBe(500);
|
|
72
|
-
expect(node?.thinContentScore).toBe(10);
|
|
73
|
-
expect(node?.externalLinkRatio).toBe(0.1);
|
|
74
|
-
expect(node?.orphanScore).toBe(5);
|
|
75
|
-
});
|
|
76
|
-
|
|
77
|
-
it('should handle null metrics gracefully', () => {
|
|
78
|
-
const siteRepo = new SiteRepository(db);
|
|
79
|
-
const snapshotRepo = new SnapshotRepository(db);
|
|
80
|
-
const pageRepo = new PageRepository(db);
|
|
81
|
-
const metricsRepo = new MetricsRepository(db);
|
|
82
|
-
|
|
83
|
-
const siteId = siteRepo.createSite('example.com');
|
|
84
|
-
const snapshotId = snapshotRepo.createSnapshot(siteId, 'full');
|
|
85
|
-
const url = 'http://example.com/page2';
|
|
86
|
-
|
|
87
|
-
pageRepo.upsertPage({
|
|
88
|
-
site_id: siteId,
|
|
89
|
-
normalized_url: url,
|
|
90
|
-
last_seen_snapshot_id: snapshotId,
|
|
91
|
-
http_status: 200,
|
|
92
|
-
depth: 1
|
|
93
|
-
});
|
|
94
|
-
const page = pageRepo.getPage(siteId, url)!;
|
|
95
|
-
|
|
96
|
-
// Insert Metrics with nulls
|
|
97
|
-
metricsRepo.insertMetrics({
|
|
98
|
-
snapshot_id: snapshotId,
|
|
99
|
-
page_id: page.id,
|
|
100
|
-
authority_score: null,
|
|
101
|
-
hub_score: null,
|
|
102
|
-
pagerank: null,
|
|
103
|
-
pagerank_score: null,
|
|
104
|
-
link_role: null,
|
|
105
|
-
crawl_status: null,
|
|
106
|
-
word_count: null,
|
|
107
|
-
thin_content_score: null,
|
|
108
|
-
external_link_ratio: null,
|
|
109
|
-
orphan_score: null,
|
|
110
|
-
duplicate_cluster_id: null,
|
|
111
|
-
duplicate_type: null,
|
|
112
|
-
is_cluster_primary: 0
|
|
113
|
-
});
|
|
114
|
-
|
|
115
|
-
const graph = loadGraphFromSnapshot(snapshotId);
|
|
116
|
-
const node = graph.nodes.get(url);
|
|
117
|
-
|
|
118
|
-
expect(node).toBeDefined();
|
|
119
|
-
// Check undefined
|
|
120
|
-
expect(node?.crawlStatus).toBeUndefined();
|
|
121
|
-
expect(node?.wordCount).toBeUndefined();
|
|
122
|
-
expect(node?.thinContentScore).toBeUndefined();
|
|
123
|
-
});
|
|
124
|
-
});
|
package/tests/hits.test.ts
DELETED
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { Graph } from '../src/graph/graph.js';
|
|
3
|
-
import { computeHITS } from '../src/scoring/hits.js';
|
|
4
|
-
|
|
5
|
-
describe('HITS Scoring', () => {
|
|
6
|
-
it('should compute scores for a simple star topology', () => {
|
|
7
|
-
const graph = new Graph();
|
|
8
|
-
// Hub
|
|
9
|
-
graph.addNode('http://hub.com', 0, 200);
|
|
10
|
-
// Authorities
|
|
11
|
-
graph.addNode('http://auth1.com', 1, 200);
|
|
12
|
-
graph.addNode('http://auth2.com', 1, 200);
|
|
13
|
-
graph.addNode('http://auth3.com', 1, 200);
|
|
14
|
-
|
|
15
|
-
graph.addEdge('http://hub.com', 'http://auth1.com');
|
|
16
|
-
graph.addEdge('http://hub.com', 'http://auth2.com');
|
|
17
|
-
graph.addEdge('http://hub.com', 'http://auth3.com');
|
|
18
|
-
|
|
19
|
-
computeHITS(graph, { iterations: 10 });
|
|
20
|
-
|
|
21
|
-
const hub = graph.nodes.get('http://hub.com')!;
|
|
22
|
-
const auth1 = graph.nodes.get('http://auth1.com')!;
|
|
23
|
-
|
|
24
|
-
// In a star topology:
|
|
25
|
-
// Hub should have max hub score
|
|
26
|
-
// Authorities should have max authority scores
|
|
27
|
-
expect(hub.hubScore).toBeGreaterThan(0.9);
|
|
28
|
-
expect(hub.authorityScore).toBe(0); // No one links to hub
|
|
29
|
-
|
|
30
|
-
expect(auth1.authorityScore).toBeGreaterThan(0.5);
|
|
31
|
-
expect(auth1.hubScore).toBe(0); // Auth1 links to no one
|
|
32
|
-
});
|
|
33
|
-
|
|
34
|
-
it('should handle exclusion rules', () => {
|
|
35
|
-
const graph = new Graph();
|
|
36
|
-
graph.addNode('http://valid.com', 0, 200);
|
|
37
|
-
graph.addNode('http://noindex.com', 0, 200);
|
|
38
|
-
graph.updateNodeData('http://noindex.com', { noindex: true });
|
|
39
|
-
graph.addNode('http://redirect.com', 0, 200);
|
|
40
|
-
graph.updateNodeData('http://redirect.com', { redirectChain: ['http://target.com'] });
|
|
41
|
-
graph.addNode('http://external.com', 0, 200); // Eligibility check marks it as eligible if status is 200
|
|
42
|
-
// but typically external wouldn't have status 200 in the graph if we don't crawl them or they are marked as external.
|
|
43
|
-
// The current hits logic relies on: status === 200 && no redirectChain && !noindex
|
|
44
|
-
|
|
45
|
-
graph.addEdge('http://valid.com', 'http://noindex.com');
|
|
46
|
-
graph.addEdge('http://valid.com', 'http://redirect.com');
|
|
47
|
-
|
|
48
|
-
computeHITS(graph);
|
|
49
|
-
|
|
50
|
-
expect(graph.nodes.get('http://noindex.com')?.hubScore).toBeUndefined();
|
|
51
|
-
expect(graph.nodes.get('http://redirect.com')?.hubScore).toBeUndefined();
|
|
52
|
-
expect(graph.nodes.get('http://valid.com')?.hubScore).toBe(0); // Valid hub but its targets are ineligible
|
|
53
|
-
});
|
|
54
|
-
|
|
55
|
-
it('should respect edge weights', () => {
|
|
56
|
-
const graph = new Graph();
|
|
57
|
-
graph.addNode('http://hub.com', 0, 200);
|
|
58
|
-
graph.addNode('http://auth-high.com', 1, 200);
|
|
59
|
-
graph.addNode('http://auth-low.com', 1, 200);
|
|
60
|
-
|
|
61
|
-
graph.addEdge('http://hub.com', 'http://auth-high.com', 1.0);
|
|
62
|
-
graph.addEdge('http://hub.com', 'http://auth-low.com', 0.1);
|
|
63
|
-
|
|
64
|
-
computeHITS(graph, { iterations: 10 });
|
|
65
|
-
|
|
66
|
-
const authHigh = graph.nodes.get('http://auth-high.com')!;
|
|
67
|
-
const authLow = graph.nodes.get('http://auth-low.com')!;
|
|
68
|
-
|
|
69
|
-
expect(authHigh.authorityScore).toBeGreaterThan(authLow.authorityScore!);
|
|
70
|
-
});
|
|
71
|
-
|
|
72
|
-
it('should classify link roles correctly', () => {
|
|
73
|
-
const graph = new Graph();
|
|
74
|
-
for (let i = 0; i < 11; i++) {
|
|
75
|
-
graph.addNode(`http://node${i}.com`, 0, 200);
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
// AUTHORITY: node1 (linked by 0,2,3... no outlinks)
|
|
79
|
-
graph.addEdge('http://node0.com', 'http://node1.com');
|
|
80
|
-
graph.addEdge('http://node2.com', 'http://node1.com');
|
|
81
|
-
graph.addEdge('http://node3.com', 'http://node1.com');
|
|
82
|
-
graph.addEdge('http://node4.com', 'http://node1.com');
|
|
83
|
-
|
|
84
|
-
// HUB: node4 (links to 1,5,6,7... few inlinks)
|
|
85
|
-
graph.addEdge('http://node4.com', 'http://node5.com');
|
|
86
|
-
graph.addEdge('http://node4.com', 'http://node6.com');
|
|
87
|
-
graph.addEdge('http://node4.com', 'http://node7.com');
|
|
88
|
-
|
|
89
|
-
// POWER: node2 (linked by 0, power is often recursive... link to authority and be linked by hub)
|
|
90
|
-
graph.addEdge('http://node0.com', 'http://node2.com');
|
|
91
|
-
graph.addEdge('http://node2.com', 'http://node1.com');
|
|
92
|
-
graph.addEdge('http://node2.com', 'http://node5.com');
|
|
93
|
-
|
|
94
|
-
// PERIPHERAL: node10 (no links)
|
|
95
|
-
// Some filler nodes to push medians down
|
|
96
|
-
graph.addEdge('http://node8.com', 'http://node9.com');
|
|
97
|
-
|
|
98
|
-
computeHITS(graph, { iterations: 20 });
|
|
99
|
-
|
|
100
|
-
const roles = graph.getNodes().map(n => n.linkRole).filter(Boolean);
|
|
101
|
-
expect(roles).toContain('authority');
|
|
102
|
-
expect(roles).toContain('hub');
|
|
103
|
-
expect(roles).toContain('power');
|
|
104
|
-
expect(roles).toContain('peripheral');
|
|
105
|
-
});
|
|
106
|
-
|
|
107
|
-
it('should handle large synthetic graphs (Performance Test)', () => {
|
|
108
|
-
const graph = new Graph();
|
|
109
|
-
const nodeCount = 5000;
|
|
110
|
-
|
|
111
|
-
// Create 5000 nodes
|
|
112
|
-
for (let i = 0; i < nodeCount; i++) {
|
|
113
|
-
graph.addNode(`http://page${i}.com`, 1, 200);
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
// Create random edges (avg 10 per node)
|
|
117
|
-
for (let i = 0; i < nodeCount; i++) {
|
|
118
|
-
for (let j = 0; j < 10; j++) {
|
|
119
|
-
const target = Math.floor(Math.random() * nodeCount);
|
|
120
|
-
if (i !== target) {
|
|
121
|
-
graph.addEdge(`http://page${i}.com`, `http://page${target}.com`);
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
const start = Date.now();
|
|
127
|
-
computeHITS(graph, { iterations: 20 });
|
|
128
|
-
const duration = Date.now() - start;
|
|
129
|
-
|
|
130
|
-
console.log(`HITS on 5000 nodes took ${duration}ms`);
|
|
131
|
-
expect(duration).toBeLessThan(2000); // Should be very fast, but allow buffer for CI environments
|
|
132
|
-
expect(graph.nodes.get('http://page0.com')?.hubScore).toBeDefined();
|
|
133
|
-
});
|
|
134
|
-
});
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import { renderAnalysisHtml, AnalysisResult, PageAnalysis } from '../src/analysis/analyze.js';
|
|
3
|
-
|
|
4
|
-
const mockPage: PageAnalysis = {
|
|
5
|
-
url: 'https://example.com',
|
|
6
|
-
status: 200,
|
|
7
|
-
seoScore: 85,
|
|
8
|
-
thinScore: 10,
|
|
9
|
-
title: { value: 'Example Title', length: 13, status: 'ok' },
|
|
10
|
-
metaDescription: { value: 'Example Desc', length: 12, status: 'ok' },
|
|
11
|
-
h1: { count: 1, status: 'ok', matchesTitle: true },
|
|
12
|
-
content: { wordCount: 500, uniqueSentenceCount: 50, textHtmlRatio: 0.6 },
|
|
13
|
-
images: { totalImages: 2, missingAlt: 0, emptyAlt: 0 },
|
|
14
|
-
links: { internalLinks: 5, externalLinks: 2, nofollowCount: 0, externalRatio: 0.2 },
|
|
15
|
-
structuredData: { present: true, valid: true, types: ['Article'] },
|
|
16
|
-
meta: { canonical: 'https://example.com', noindex: false, nofollow: false }
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
const mockResult: AnalysisResult = {
|
|
20
|
-
site_summary: {
|
|
21
|
-
pages_analyzed: 1,
|
|
22
|
-
avg_seo_score: 85,
|
|
23
|
-
thin_pages: 0,
|
|
24
|
-
duplicate_titles: 0,
|
|
25
|
-
site_score: 90
|
|
26
|
-
},
|
|
27
|
-
site_scores: { overallScore: 90, seoHealthScore: 85 },
|
|
28
|
-
pages: [mockPage],
|
|
29
|
-
active_modules: { seo: true, content: true, accessibility: true }
|
|
30
|
-
};
|
|
31
|
-
|
|
32
|
-
describe('HTML Report Generation', () => {
|
|
33
|
-
test('generates single page report correctly', () => {
|
|
34
|
-
// If pages length is 1, it renders single page report
|
|
35
|
-
const html = renderAnalysisHtml(mockResult);
|
|
36
|
-
expect(html).toContain('<!DOCTYPE html>');
|
|
37
|
-
expect(html).toContain('Analysis for https://example.com');
|
|
38
|
-
expect(html).toContain('Example Title');
|
|
39
|
-
expect(html).toContain('Example Desc');
|
|
40
|
-
expect(html).toContain('500 words');
|
|
41
|
-
expect(html).toContain('<span class="status-ok">Valid</span>');
|
|
42
|
-
});
|
|
43
|
-
|
|
44
|
-
test('generates list report correctly', () => {
|
|
45
|
-
// Modify result to have 2 pages to trigger list view
|
|
46
|
-
const listResult: AnalysisResult = {
|
|
47
|
-
...mockResult,
|
|
48
|
-
pages: [mockPage, { ...mockPage, url: 'https://example.com/2' }]
|
|
49
|
-
};
|
|
50
|
-
const html = renderAnalysisHtml(listResult);
|
|
51
|
-
|
|
52
|
-
expect(html).toContain('<!DOCTYPE html>');
|
|
53
|
-
expect(html).toContain('Crawlith Analysis Report');
|
|
54
|
-
expect(html).toContain('Pages: 1'); // site_summary.pages_analyzed is 1 in mockResult
|
|
55
|
-
expect(html).toContain('https://example.com');
|
|
56
|
-
expect(html).toContain('https://example.com/2');
|
|
57
|
-
expect(html).toContain('<td>85</td>'); // seoScore
|
|
58
|
-
});
|
|
59
|
-
});
|
package/tests/ipGuard.test.ts
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
-
import { IPGuard } from '../src/core/security/ipGuard.js';
|
|
3
|
-
import * as dns from 'dns';
|
|
4
|
-
|
|
5
|
-
vi.mock('dns', () => ({
|
|
6
|
-
lookup: vi.fn(),
|
|
7
|
-
resolve4: vi.fn(),
|
|
8
|
-
resolve6: vi.fn(),
|
|
9
|
-
}));
|
|
10
|
-
|
|
11
|
-
describe('IPGuard Secure Lookup', () => {
|
|
12
|
-
it('should resolve safe IPs', () => {
|
|
13
|
-
const lookupMock = vi.mocked(dns.lookup);
|
|
14
|
-
// Mock successful resolution
|
|
15
|
-
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
16
|
-
callback(null, '8.8.8.8', 4);
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
const callback = vi.fn();
|
|
20
|
-
IPGuard.secureLookup('google.com', {}, callback);
|
|
21
|
-
|
|
22
|
-
expect(callback).toHaveBeenCalledWith(null, '8.8.8.8', 4);
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it('should block internal IPs', () => {
|
|
26
|
-
const lookupMock = vi.mocked(dns.lookup);
|
|
27
|
-
// Mock internal IP resolution
|
|
28
|
-
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
29
|
-
callback(null, '127.0.0.1', 4);
|
|
30
|
-
});
|
|
31
|
-
|
|
32
|
-
const callback = vi.fn();
|
|
33
|
-
IPGuard.secureLookup('localhost', {}, callback);
|
|
34
|
-
|
|
35
|
-
expect(callback).toHaveBeenCalledWith(expect.any(Error), '127.0.0.1', 4);
|
|
36
|
-
const error = callback.mock.calls[0][0];
|
|
37
|
-
expect(error.message).toContain('Blocked internal IP');
|
|
38
|
-
expect(error.code).toBe('EBLOCKED');
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
it('should handle array of IPs (IPv4)', () => {
|
|
42
|
-
const lookupMock = vi.mocked(dns.lookup);
|
|
43
|
-
// Mock array resolution
|
|
44
|
-
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
45
|
-
// Mocking address array structure
|
|
46
|
-
const addresses = [
|
|
47
|
-
{ address: '1.1.1.1', family: 4 },
|
|
48
|
-
{ address: '127.0.0.1', family: 4 }
|
|
49
|
-
];
|
|
50
|
-
callback(null, addresses as any, 4);
|
|
51
|
-
});
|
|
52
|
-
|
|
53
|
-
const callback = vi.fn();
|
|
54
|
-
IPGuard.secureLookup('mixed.com', { all: true } as any, callback);
|
|
55
|
-
|
|
56
|
-
expect(callback).toHaveBeenCalledWith(expect.any(Error), expect.anything(), 4);
|
|
57
|
-
const error = callback.mock.calls[0][0];
|
|
58
|
-
expect(error.message).toContain('Blocked internal IP');
|
|
59
|
-
});
|
|
60
|
-
|
|
61
|
-
it('should pass through DNS errors', () => {
|
|
62
|
-
const lookupMock = vi.mocked(dns.lookup);
|
|
63
|
-
const dnsError = new Error('ENOTFOUND');
|
|
64
|
-
lookupMock.mockImplementation((hostname, options, callback) => {
|
|
65
|
-
callback(dnsError as any, undefined as any, 0);
|
|
66
|
-
});
|
|
67
|
-
|
|
68
|
-
const callback = vi.fn();
|
|
69
|
-
IPGuard.secureLookup('invalid.domain', {}, callback);
|
|
70
|
-
|
|
71
|
-
expect(callback).toHaveBeenCalledWith(dnsError, undefined, 0);
|
|
72
|
-
});
|
|
73
|
-
});
|