@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/tests/metrics.test.ts
DELETED
|
@@ -1,196 +0,0 @@
|
|
|
1
|
-
import { Graph } from '../src/graph/graph.js';
|
|
2
|
-
import { calculateMetrics } from '../src/graph/metrics.js';
|
|
3
|
-
import { test, expect } from 'vitest';
|
|
4
|
-
|
|
5
|
-
test('graph metrics basic', () => {
|
|
6
|
-
const g = new Graph();
|
|
7
|
-
|
|
8
|
-
// Structure:
|
|
9
|
-
// A -> B
|
|
10
|
-
// A -> C
|
|
11
|
-
// B -> C
|
|
12
|
-
// C -> A
|
|
13
|
-
|
|
14
|
-
g.addNode('A', 0, 200);
|
|
15
|
-
g.addNode('B', 1, 200);
|
|
16
|
-
g.addNode('C', 1, 200);
|
|
17
|
-
|
|
18
|
-
g.addEdge('A', 'B');
|
|
19
|
-
g.addEdge('A', 'C');
|
|
20
|
-
g.addEdge('B', 'C');
|
|
21
|
-
g.addEdge('C', 'A');
|
|
22
|
-
|
|
23
|
-
const metrics = calculateMetrics(g, 5);
|
|
24
|
-
|
|
25
|
-
expect(metrics.totalPages).toBe(3);
|
|
26
|
-
expect(metrics.totalEdges).toBe(4);
|
|
27
|
-
|
|
28
|
-
// Check degrees on nodes directly
|
|
29
|
-
const nodeA = g.nodes.get('A');
|
|
30
|
-
expect(nodeA?.inLinks).toBe(1);
|
|
31
|
-
expect(nodeA?.outLinks).toBe(2);
|
|
32
|
-
|
|
33
|
-
const nodeC = g.nodes.get('C');
|
|
34
|
-
expect(nodeC?.inLinks).toBe(2);
|
|
35
|
-
expect(nodeC?.outLinks).toBe(1);
|
|
36
|
-
|
|
37
|
-
expect(metrics.averageOutDegree).toBeCloseTo(4/3);
|
|
38
|
-
|
|
39
|
-
// Top authority should be C with 2 in-links, authority = 1
|
|
40
|
-
expect(metrics.topAuthorityPages[0].url).toBe('C');
|
|
41
|
-
expect(metrics.topAuthorityPages[0].authority).toBeCloseTo(1);
|
|
42
|
-
|
|
43
|
-
// Max depth found
|
|
44
|
-
expect(metrics.maxDepthFound).toBe(1);
|
|
45
|
-
|
|
46
|
-
// Orphan pages (none)
|
|
47
|
-
expect(metrics.orphanPages).toEqual([]);
|
|
48
|
-
});
|
|
49
|
-
|
|
50
|
-
test('orphan pages', () => {
|
|
51
|
-
const g = new Graph();
|
|
52
|
-
g.addNode('Root', 0, 200);
|
|
53
|
-
g.addNode('Orphan', 1, 200);
|
|
54
|
-
// Orphan is at depth 1 but no incoming edges recorded (maybe missed or filtered)
|
|
55
|
-
|
|
56
|
-
const metrics = calculateMetrics(g, 5);
|
|
57
|
-
expect(metrics.orphanPages).toContain('Orphan');
|
|
58
|
-
expect(metrics.orphanPages).not.toContain('Root');
|
|
59
|
-
});
|
|
60
|
-
test('metrics v2 calculations', () => {
|
|
61
|
-
const g = new Graph();
|
|
62
|
-
|
|
63
|
-
// Root (depth 0, in=0, out=2)
|
|
64
|
-
g.addNode('root', 0, 200);
|
|
65
|
-
|
|
66
|
-
// A (depth 1, in=1, out=1)
|
|
67
|
-
g.addNode('A', 1, 200);
|
|
68
|
-
g.addEdge('root', 'A');
|
|
69
|
-
|
|
70
|
-
// B (depth 1, in=1, out=0)
|
|
71
|
-
g.addNode('B', 1, 200);
|
|
72
|
-
g.addEdge('root', 'B');
|
|
73
|
-
|
|
74
|
-
// C (depth 2, in=1, out=0)
|
|
75
|
-
g.addNode('C', 2, 200);
|
|
76
|
-
g.addEdge('A', 'C');
|
|
77
|
-
|
|
78
|
-
// Orphan (depth 1, in=0) - e.g. added but no edge to it?
|
|
79
|
-
// If it's in graph with depth > 0 and inLinks=0, it's an orphan.
|
|
80
|
-
g.addNode('orphan', 1, 200);
|
|
81
|
-
|
|
82
|
-
// Near Orphan (depth 3, in=1)
|
|
83
|
-
g.addNode('D', 2, 200);
|
|
84
|
-
g.addNode('nearOrphan', 3, 200);
|
|
85
|
-
g.addEdge('C', 'D'); // C->D
|
|
86
|
-
g.addEdge('D', 'nearOrphan'); // D->nearOrphan
|
|
87
|
-
|
|
88
|
-
// Deep page (depth 4)
|
|
89
|
-
g.addNode('deep', 4, 200);
|
|
90
|
-
g.addEdge('nearOrphan', 'deep');
|
|
91
|
-
|
|
92
|
-
// Nodes: root(0), A(1), B(1), C(2), orphan(1), D(2), nearOrphan(3), deep(4)
|
|
93
|
-
// Total pages: 8
|
|
94
|
-
|
|
95
|
-
// Edges: root->A, root->B, A->C, C->D, D->nearOrphan, nearOrphan->deep
|
|
96
|
-
// Total edges: 6
|
|
97
|
-
|
|
98
|
-
// InLinks:
|
|
99
|
-
// root: 0
|
|
100
|
-
// A: 1
|
|
101
|
-
// B: 1
|
|
102
|
-
// C: 1
|
|
103
|
-
// orphan: 0
|
|
104
|
-
// D: 1
|
|
105
|
-
// nearOrphan: 1
|
|
106
|
-
// deep: 1
|
|
107
|
-
|
|
108
|
-
// Max InLinks = 1.
|
|
109
|
-
// Authority Score = log(1 + in) / log(1 + maxIn)
|
|
110
|
-
// If maxIn = 1, log(2).
|
|
111
|
-
// For A: log(2)/log(2) = 1.
|
|
112
|
-
// For root: log(1)/log(2) = 0.
|
|
113
|
-
|
|
114
|
-
// Let's make maxIn > 1 to test better.
|
|
115
|
-
g.addNode('popular', 1, 200);
|
|
116
|
-
g.addEdge('root', 'popular');
|
|
117
|
-
g.addEdge('A', 'popular');
|
|
118
|
-
// popular inLinks = 2. MaxIn = 2.
|
|
119
|
-
// Authority popular = log(3)/log(3) = 1.
|
|
120
|
-
// Authority A = log(2)/log(3) approx 0.63
|
|
121
|
-
|
|
122
|
-
const metrics = calculateMetrics(g, 10); // maxDepth arg (not used for calculation logic of deepPages which is hardcoded >=4 per prompt?)
|
|
123
|
-
// Prompt says "deepPages: depth >= 4".
|
|
124
|
-
// Existing calculateMetrics takes maxDepth arg.
|
|
125
|
-
// Existing: deepPages = nodes.filter(n => n.depth >= maxDepth)
|
|
126
|
-
// New requirement: deepPages: depth >= 4.
|
|
127
|
-
// I should probably ignore the argument or update the requirement interpretation.
|
|
128
|
-
// "deepPages: depth >= 4" implies fixed threshold.
|
|
129
|
-
|
|
130
|
-
// Orphan pages: inLinks === 0 && depth > 0
|
|
131
|
-
expect(metrics.orphanPages).toContain('orphan');
|
|
132
|
-
expect(metrics.orphanPages).not.toContain('root'); // depth 0
|
|
133
|
-
|
|
134
|
-
// Near orphans: inLinks === 1 && depth >= 3
|
|
135
|
-
expect(metrics.nearOrphans).toContain('nearOrphan'); // depth 3, in 1
|
|
136
|
-
expect(metrics.nearOrphans).toContain('deep'); // depth 4, in 1 (from nearOrphan)
|
|
137
|
-
expect(metrics.nearOrphans).not.toContain('D'); // depth 2
|
|
138
|
-
|
|
139
|
-
// Deep pages: depth >= 4
|
|
140
|
-
expect(metrics.deepPages).toContain('deep');
|
|
141
|
-
expect(metrics.deepPages).not.toContain('nearOrphan');
|
|
142
|
-
|
|
143
|
-
// Crawl Efficiency Score: 1 - (deepPagesCount / totalPages)
|
|
144
|
-
// Total: 9 nodes (root, A, B, C, orphan, D, nearOrphan, deep, popular)
|
|
145
|
-
// Deep: 1 (deep)
|
|
146
|
-
// Score: 1 - 1/9 = 8/9 = 0.888...
|
|
147
|
-
expect(metrics.crawlEfficiencyScore).toBeCloseTo(8/9);
|
|
148
|
-
|
|
149
|
-
// Average Depth: sum(depth) / totalPages
|
|
150
|
-
// Depths: 0, 1, 1, 2, 1, 2, 3, 4, 1
|
|
151
|
-
// Sum: 15
|
|
152
|
-
// Avg: 15/9 = 1.666...
|
|
153
|
-
expect(metrics.averageDepth).toBeCloseTo(15/9);
|
|
154
|
-
|
|
155
|
-
// Structural Entropy
|
|
156
|
-
// OutDegrees:
|
|
157
|
-
// root: 3 (A, B, popular)
|
|
158
|
-
// A: 2 (C, popular)
|
|
159
|
-
// B: 0
|
|
160
|
-
// C: 1 (D)
|
|
161
|
-
// orphan: 0
|
|
162
|
-
// D: 1 (nearOrphan)
|
|
163
|
-
// nearOrphan: 1 (deep)
|
|
164
|
-
// deep: 0
|
|
165
|
-
// popular: 0
|
|
166
|
-
|
|
167
|
-
// Distribution:
|
|
168
|
-
// 0: 4 nodes (B, orphan, deep, popular)
|
|
169
|
-
// 1: 3 nodes (C, D, nearOrphan)
|
|
170
|
-
// 2: 1 node (A)
|
|
171
|
-
// 3: 1 node (root)
|
|
172
|
-
|
|
173
|
-
// P(0) = 4/9
|
|
174
|
-
// P(1) = 3/9
|
|
175
|
-
// P(2) = 1/9
|
|
176
|
-
// P(3) = 1/9
|
|
177
|
-
|
|
178
|
-
// Entropy = - (4/9 log2(4/9) + 3/9 log2(3/9) + 1/9 log2(1/9) + 1/9 log2(1/9))
|
|
179
|
-
// = - (0.444 * -1.17 + 0.333 * -1.58 + 0.111 * -3.17 + 0.111 * -3.17)
|
|
180
|
-
// approx 1.75
|
|
181
|
-
|
|
182
|
-
// Let's compute exact expected value
|
|
183
|
-
const p0 = 4/9;
|
|
184
|
-
const p1 = 3/9;
|
|
185
|
-
const p2 = 1/9;
|
|
186
|
-
const p3 = 1/9;
|
|
187
|
-
const entropy = - (p0 * Math.log2(p0) + p1 * Math.log2(p1) + p2 * Math.log2(p2) + p3 * Math.log2(p3));
|
|
188
|
-
|
|
189
|
-
expect(metrics.structuralEntropy).toBeCloseTo(entropy);
|
|
190
|
-
|
|
191
|
-
// Limit Reached
|
|
192
|
-
expect(metrics.limitReached).toBe(false);
|
|
193
|
-
g.limitReached = true;
|
|
194
|
-
const metrics2 = calculateMetrics(g, 10);
|
|
195
|
-
expect(metrics2.limitReached).toBe(true);
|
|
196
|
-
});
|
package/tests/normalize.test.ts
DELETED
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
import { normalizeUrl } from '../src/crawler/normalize.js';
|
|
2
|
-
import { extractLinks } from '../src/crawler/extract.js';
|
|
3
|
-
import { test, expect } from 'vitest';
|
|
4
|
-
|
|
5
|
-
test('normalizeUrl', () => {
|
|
6
|
-
expect(normalizeUrl('https://Example.com/Foo/', '')).toBe('https://example.com/Foo');
|
|
7
|
-
expect(normalizeUrl('http://example.com:80/bar', '')).toBe('http://example.com/bar');
|
|
8
|
-
expect(normalizeUrl('https://example.com/baz#frag', '')).toBe('https://example.com/baz');
|
|
9
|
-
expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: true })).toBe('https://example.com/qux');
|
|
10
|
-
expect(normalizeUrl('https://example.com/qux?a=1', '', { stripQuery: false })).toBe('https://example.com/qux?a=1');
|
|
11
|
-
expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
|
|
12
|
-
});
|
|
13
|
-
|
|
14
|
-
test('extractLinks', () => {
|
|
15
|
-
const html = `
|
|
16
|
-
<html>
|
|
17
|
-
<body>
|
|
18
|
-
<a href="/foo">Foo</a>
|
|
19
|
-
<a href="bar">Bar</a>
|
|
20
|
-
<a href="https://other.com/baz">Baz</a>
|
|
21
|
-
<a href="#top">Top</a>
|
|
22
|
-
</body>
|
|
23
|
-
</html>
|
|
24
|
-
`;
|
|
25
|
-
const links = extractLinks(html, 'https://example.com/page/');
|
|
26
|
-
expect(links).toContain('https://example.com/foo');
|
|
27
|
-
expect(links).toContain('https://example.com/page/bar');
|
|
28
|
-
expect(links).toContain('https://other.com/baz');
|
|
29
|
-
expect(links).not.toContain('https://example.com/page/#top');
|
|
30
|
-
expect(links).toContain('https://example.com/page/'); // #top resolves to base url without fragment
|
|
31
|
-
});
|
|
32
|
-
test('normalizeUrl: absolute resolution', () => {
|
|
33
|
-
expect(normalizeUrl('/foo', 'https://example.com')).toBe('https://example.com/foo');
|
|
34
|
-
expect(normalizeUrl('bar', 'https://example.com/baz/')).toBe('https://example.com/baz/bar');
|
|
35
|
-
expect(normalizeUrl('//other.com/foo', 'https://example.com')).toBe('https://other.com/foo');
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
test('normalizeUrl: only http/https', () => {
|
|
39
|
-
expect(normalizeUrl('ftp://example.com/file', 'https://example.com')).toBeNull();
|
|
40
|
-
expect(normalizeUrl('mailto:user@example.com', 'https://example.com')).toBeNull();
|
|
41
|
-
expect(normalizeUrl('javascript:alert(1)', 'https://example.com')).toBeNull();
|
|
42
|
-
});
|
|
43
|
-
|
|
44
|
-
test('normalizeUrl: lowercase hostname', () => {
|
|
45
|
-
expect(normalizeUrl('https://EXAMPLE.com/foo', '')).toBe('https://example.com/foo');
|
|
46
|
-
});
|
|
47
|
-
|
|
48
|
-
test('normalizeUrl: remove default ports', () => {
|
|
49
|
-
expect(normalizeUrl('http://example.com:80/foo', '')).toBe('http://example.com/foo');
|
|
50
|
-
expect(normalizeUrl('https://example.com:443/foo', '')).toBe('https://example.com/foo');
|
|
51
|
-
expect(normalizeUrl('http://example.com:8080/foo', '')).toBe('http://example.com:8080/foo');
|
|
52
|
-
});
|
|
53
|
-
|
|
54
|
-
test('normalizeUrl: remove hash fragments', () => {
|
|
55
|
-
expect(normalizeUrl('https://example.com/foo#bar', '')).toBe('https://example.com/foo');
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
test('normalizeUrl: strip query', () => {
|
|
59
|
-
expect(normalizeUrl('https://example.com/foo?a=1&b=2', '', { stripQuery: true })).toBe('https://example.com/foo');
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
test('normalizeUrl: filter tracking params', () => {
|
|
63
|
-
const url = 'https://example.com/foo?utm_source=google&utm_medium=cpc&a=1&fbclid=123';
|
|
64
|
-
expect(normalizeUrl(url, '', { stripQuery: false })).toBe('https://example.com/foo?a=1');
|
|
65
|
-
|
|
66
|
-
const url2 = 'https://example.com/foo?gclid=abc&msclkid=def';
|
|
67
|
-
expect(normalizeUrl(url2, '', { stripQuery: false })).toBe('https://example.com/foo');
|
|
68
|
-
});
|
|
69
|
-
|
|
70
|
-
test('normalizeUrl: trailing slash', () => {
|
|
71
|
-
expect(normalizeUrl('https://example.com/foo/', '')).toBe('https://example.com/foo');
|
|
72
|
-
expect(normalizeUrl('https://example.com/', '')).toBe('https://example.com/');
|
|
73
|
-
});
|
|
74
|
-
|
|
75
|
-
test('normalizeUrl: collapse duplicate slashes', () => {
|
|
76
|
-
expect(normalizeUrl('https://example.com/foo//bar', '')).toBe('https://example.com/foo/bar');
|
|
77
|
-
expect(normalizeUrl('https://example.com//foo///bar', '')).toBe('https://example.com/foo/bar');
|
|
78
|
-
});
|
|
79
|
-
|
|
80
|
-
test('normalizeUrl: skip non-HTML assets', () => {
|
|
81
|
-
expect(normalizeUrl('https://example.com/file.pdf', '')).toBeNull();
|
|
82
|
-
expect(normalizeUrl('https://example.com/image.jpg', '')).toBeNull();
|
|
83
|
-
expect(normalizeUrl('https://example.com/image.png', '')).toBeNull();
|
|
84
|
-
expect(normalizeUrl('https://example.com/image.svg', '')).toBeNull();
|
|
85
|
-
expect(normalizeUrl('https://example.com/image.webp', '')).toBeNull();
|
|
86
|
-
expect(normalizeUrl('https://example.com/image.gif', '')).toBeNull();
|
|
87
|
-
expect(normalizeUrl('https://example.com/archive.zip', '')).toBeNull();
|
|
88
|
-
expect(normalizeUrl('https://example.com/data.xml', '')).toBeNull();
|
|
89
|
-
expect(normalizeUrl('https://example.com/data.json', '')).toBeNull();
|
|
90
|
-
expect(normalizeUrl('https://example.com/video.mp4', '')).toBeNull();
|
|
91
|
-
|
|
92
|
-
// HTML extensions should pass (or no extension)
|
|
93
|
-
expect(normalizeUrl('https://example.com/page.html', '')).toBe('https://example.com/page.html');
|
|
94
|
-
expect(normalizeUrl('https://example.com/page.htm', '')).toBe('https://example.com/page.htm');
|
|
95
|
-
expect(normalizeUrl('https://example.com/page', '')).toBe('https://example.com/page');
|
|
96
|
-
});
|
|
97
|
-
|
|
98
|
-
test('normalizeUrl: return format', () => {
|
|
99
|
-
const res = normalizeUrl('https://example.com/foo?a=1', '');
|
|
100
|
-
expect(res).toBe('https://example.com/foo?a=1');
|
|
101
|
-
});
|
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from 'vitest';
|
|
2
|
-
import { annotateOrphans, calculateOrphanSeverity, mapImpactLevel, type SitegraphNode, type SitegraphEdge } from '../src/scoring/orphanSeverity.js';
|
|
3
|
-
|
|
4
|
-
function baseNode(url: string, overrides: Partial<SitegraphNode> = {}): SitegraphNode {
|
|
5
|
-
return {
|
|
6
|
-
url,
|
|
7
|
-
depth: 1,
|
|
8
|
-
inLinks: 0,
|
|
9
|
-
outLinks: 0,
|
|
10
|
-
status: 200,
|
|
11
|
-
...overrides
|
|
12
|
-
};
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
describe('orphan detection and severity scoring', () => {
|
|
16
|
-
test('hard orphan detection and homepage exclusion', () => {
|
|
17
|
-
const nodes: SitegraphNode[] = [
|
|
18
|
-
baseNode('https://example.com/', { depth: 0, inLinks: 0 }),
|
|
19
|
-
baseNode('https://example.com/orphan', { inLinks: 0 })
|
|
20
|
-
];
|
|
21
|
-
const edges: SitegraphEdge[] = [];
|
|
22
|
-
|
|
23
|
-
const result = annotateOrphans(nodes, edges, {
|
|
24
|
-
enabled: true,
|
|
25
|
-
severityEnabled: false,
|
|
26
|
-
includeSoftOrphans: false,
|
|
27
|
-
minInbound: 2,
|
|
28
|
-
rootUrl: 'https://example.com/'
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
expect(result[0]).toMatchObject({ orphan: false });
|
|
32
|
-
expect(result[1]).toMatchObject({ orphan: true, orphanType: 'hard' });
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
test('near orphan threshold override', () => {
|
|
36
|
-
const nodes = [baseNode('https://example.com/near', { inLinks: 2 })];
|
|
37
|
-
const edges: SitegraphEdge[] = [];
|
|
38
|
-
|
|
39
|
-
const resultDefault = annotateOrphans(nodes, edges, {
|
|
40
|
-
enabled: true,
|
|
41
|
-
severityEnabled: false,
|
|
42
|
-
includeSoftOrphans: false,
|
|
43
|
-
minInbound: 2
|
|
44
|
-
});
|
|
45
|
-
const resultStrict = annotateOrphans(nodes, edges, {
|
|
46
|
-
enabled: true,
|
|
47
|
-
severityEnabled: false,
|
|
48
|
-
includeSoftOrphans: false,
|
|
49
|
-
minInbound: 1
|
|
50
|
-
});
|
|
51
|
-
|
|
52
|
-
expect(resultDefault[0]).toMatchObject({ orphan: true, orphanType: 'near' });
|
|
53
|
-
expect(resultStrict[0]).toMatchObject({ orphan: false });
|
|
54
|
-
});
|
|
55
|
-
|
|
56
|
-
test('soft orphan detection only when enabled and inbound only from low-value sources', () => {
|
|
57
|
-
const nodes: SitegraphNode[] = [
|
|
58
|
-
baseNode('https://example.com/tag/seo', { pageType: 'tag', outLinks: 1 }),
|
|
59
|
-
baseNode('https://example.com/list?page=2', { pageType: 'pagination', outLinks: 1 }),
|
|
60
|
-
baseNode('https://example.com/target', { inLinks: 2 }),
|
|
61
|
-
baseNode('https://example.com/normal', { outLinks: 1 })
|
|
62
|
-
];
|
|
63
|
-
|
|
64
|
-
const edges: SitegraphEdge[] = [
|
|
65
|
-
{ source: 'https://example.com/tag/seo', target: 'https://example.com/target' },
|
|
66
|
-
{ source: 'https://example.com/list?page=2', target: 'https://example.com/target' }
|
|
67
|
-
];
|
|
68
|
-
|
|
69
|
-
const withSoft = annotateOrphans(nodes, edges, {
|
|
70
|
-
enabled: true,
|
|
71
|
-
severityEnabled: false,
|
|
72
|
-
includeSoftOrphans: true,
|
|
73
|
-
minInbound: 1
|
|
74
|
-
});
|
|
75
|
-
|
|
76
|
-
const withoutSoft = annotateOrphans(nodes, edges, {
|
|
77
|
-
enabled: true,
|
|
78
|
-
severityEnabled: false,
|
|
79
|
-
includeSoftOrphans: false,
|
|
80
|
-
minInbound: 1
|
|
81
|
-
});
|
|
82
|
-
|
|
83
|
-
expect(withSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: true, orphanType: 'soft' });
|
|
84
|
-
expect(withoutSoft.find((n) => n.url.endsWith('/target'))).toMatchObject({ orphan: false });
|
|
85
|
-
});
|
|
86
|
-
|
|
87
|
-
test('crawl-only orphan detection', () => {
|
|
88
|
-
const nodes = [baseNode('https://example.com/sitemap-only', { inLinks: 0, discoveredViaSitemap: true })];
|
|
89
|
-
const result = annotateOrphans(nodes, [], {
|
|
90
|
-
enabled: true,
|
|
91
|
-
severityEnabled: false,
|
|
92
|
-
includeSoftOrphans: false,
|
|
93
|
-
minInbound: 2
|
|
94
|
-
});
|
|
95
|
-
|
|
96
|
-
expect(result[0]).toMatchObject({ orphan: true, orphanType: 'crawl-only' });
|
|
97
|
-
});
|
|
98
|
-
|
|
99
|
-
test('severity calculation modifiers and score clamping', () => {
|
|
100
|
-
const high = calculateOrphanSeverity('hard', baseNode('https://example.com/high', {
|
|
101
|
-
inLinks: 0,
|
|
102
|
-
wordCount: 1500,
|
|
103
|
-
hasStructuredData: true,
|
|
104
|
-
depth: 1,
|
|
105
|
-
isProductOrCommercial: true
|
|
106
|
-
}));
|
|
107
|
-
|
|
108
|
-
const low = calculateOrphanSeverity('hard', baseNode('https://example.com/low', {
|
|
109
|
-
inLinks: 0,
|
|
110
|
-
wordCount: 120,
|
|
111
|
-
noindex: true,
|
|
112
|
-
duplicateContent: true,
|
|
113
|
-
pageType: 'archive'
|
|
114
|
-
}));
|
|
115
|
-
|
|
116
|
-
expect(high).toBe(100);
|
|
117
|
-
expect(low).toBe(80);
|
|
118
|
-
});
|
|
119
|
-
|
|
120
|
-
test('impact level mapping', () => {
|
|
121
|
-
expect(mapImpactLevel(0)).toBe('low');
|
|
122
|
-
expect(mapImpactLevel(39)).toBe('low');
|
|
123
|
-
expect(mapImpactLevel(40)).toBe('medium');
|
|
124
|
-
expect(mapImpactLevel(69)).toBe('medium');
|
|
125
|
-
expect(mapImpactLevel(70)).toBe('high');
|
|
126
|
-
expect(mapImpactLevel(89)).toBe('high');
|
|
127
|
-
expect(mapImpactLevel(90)).toBe('critical');
|
|
128
|
-
expect(mapImpactLevel(100)).toBe('critical');
|
|
129
|
-
});
|
|
130
|
-
|
|
131
|
-
test('canonical consolidation, robots exclusion, and deterministic JSON output snapshot', () => {
|
|
132
|
-
const nodes: SitegraphNode[] = [
|
|
133
|
-
baseNode('https://example.com/canonical', { inLinks: 0 }),
|
|
134
|
-
baseNode('https://example.com/variant?a=1', { canonicalUrl: 'https://example.com/canonical', inLinks: 1 }),
|
|
135
|
-
baseNode('https://example.com/blocked', { inLinks: 0, robotsExcluded: true }),
|
|
136
|
-
baseNode('https://example.com/redirect-target', { inLinks: 1 })
|
|
137
|
-
];
|
|
138
|
-
|
|
139
|
-
const edges: SitegraphEdge[] = [
|
|
140
|
-
{ source: 'https://example.com/redirect-source', target: 'https://example.com/redirect-target' }
|
|
141
|
-
];
|
|
142
|
-
|
|
143
|
-
const options = {
|
|
144
|
-
enabled: true,
|
|
145
|
-
severityEnabled: true,
|
|
146
|
-
includeSoftOrphans: true,
|
|
147
|
-
minInbound: 2
|
|
148
|
-
};
|
|
149
|
-
|
|
150
|
-
const first = annotateOrphans(nodes, edges, options);
|
|
151
|
-
const second = annotateOrphans(nodes, edges, options);
|
|
152
|
-
|
|
153
|
-
expect(first).toEqual(second);
|
|
154
|
-
expect(first.find((n) => n.url.endsWith('/canonical'))).toMatchObject({ orphan: true, orphanType: 'near' });
|
|
155
|
-
expect(first.find((n) => n.url.endsWith('/blocked'))).toMatchObject({ orphan: false });
|
|
156
|
-
|
|
157
|
-
const normalized = JSON.stringify(first, null, 2).replace(/\r\n/g, '\n');
|
|
158
|
-
expect(normalized).toMatchSnapshot();
|
|
159
|
-
});
|
|
160
|
-
});
|
package/tests/pagerank.test.ts
DELETED
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { Graph } from '../src/graph/graph.js';
|
|
3
|
-
import { computePageRank } from '../src/graph/pagerank.js';
|
|
4
|
-
|
|
5
|
-
describe('PageRank Engine', () => {
|
|
6
|
-
it('should calculate identical PageRank for a simple loop', () => {
|
|
7
|
-
const graph = new Graph();
|
|
8
|
-
graph.addNode('https://a.com', 0, 200);
|
|
9
|
-
graph.addNode('https://b.com', 1, 200);
|
|
10
|
-
graph.addEdge('https://a.com', 'https://b.com');
|
|
11
|
-
graph.addEdge('https://b.com', 'https://a.com');
|
|
12
|
-
|
|
13
|
-
computePageRank(graph);
|
|
14
|
-
const nodes = graph.getNodes();
|
|
15
|
-
|
|
16
|
-
expect(nodes[0].pageRank).toBeCloseTo(0.5, 4);
|
|
17
|
-
expect(nodes[1].pageRank).toBeCloseTo(0.5, 4);
|
|
18
|
-
expect(nodes[0].pageRankScore).toBe(100);
|
|
19
|
-
expect(nodes[1].pageRankScore).toBe(100);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
it('should identify the center of a star graph as most important', () => {
|
|
23
|
-
const graph = new Graph();
|
|
24
|
-
graph.addNode('https://center.com', 0, 200);
|
|
25
|
-
graph.addNode('https://p1.com', 1, 200);
|
|
26
|
-
graph.addNode('https://p2.com', 1, 200);
|
|
27
|
-
graph.addNode('https://p3.com', 1, 200);
|
|
28
|
-
|
|
29
|
-
// Star in: all link to center
|
|
30
|
-
graph.addEdge('https://p1.com', 'https://center.com');
|
|
31
|
-
graph.addEdge('https://p2.com', 'https://center.com');
|
|
32
|
-
graph.addEdge('https://p3.com', 'https://center.com');
|
|
33
|
-
|
|
34
|
-
computePageRank(graph);
|
|
35
|
-
const nodes = graph.getNodes();
|
|
36
|
-
|
|
37
|
-
const center = nodes.find(n => n.url.includes('center'))!;
|
|
38
|
-
const leaves = nodes.filter(n => !n.url.includes('center'));
|
|
39
|
-
|
|
40
|
-
expect(center.pageRankScore).toBe(100);
|
|
41
|
-
leaves.forEach(leaf => {
|
|
42
|
-
expect(leaf.pageRankScore).toBeLessThan(100);
|
|
43
|
-
expect(leaf.pageRank!).toBeLessThan(center.pageRank!);
|
|
44
|
-
});
|
|
45
|
-
});
|
|
46
|
-
|
|
47
|
-
it('should respect link weights (Body > Nav > Footer)', () => {
|
|
48
|
-
const graph = new Graph();
|
|
49
|
-
graph.addNode('https://source.com', 0, 200);
|
|
50
|
-
graph.addNode('https://body-target.com', 1, 200);
|
|
51
|
-
graph.addNode('https://footer-target.com', 1, 200);
|
|
52
|
-
|
|
53
|
-
// Body weight 1.0, Footer weight 0.4
|
|
54
|
-
graph.addEdge('https://source.com', 'https://body-target.com', 1.0);
|
|
55
|
-
graph.addEdge('https://source.com', 'https://footer-target.com', 0.4);
|
|
56
|
-
|
|
57
|
-
computePageRank(graph);
|
|
58
|
-
|
|
59
|
-
const bodyTarget = graph.nodes.get('https://body-target.com')!;
|
|
60
|
-
const footerTarget = graph.nodes.get('https://footer-target.com')!;
|
|
61
|
-
|
|
62
|
-
expect(bodyTarget.pageRank!).toBeGreaterThan(footerTarget.pageRank!);
|
|
63
|
-
});
|
|
64
|
-
|
|
65
|
-
it('should handle sink nodes by redistributing rank', () => {
|
|
66
|
-
const graph = new Graph();
|
|
67
|
-
graph.addNode('https://a.com', 0, 200);
|
|
68
|
-
graph.addNode('https://b.com', 1, 200); // b is a sink
|
|
69
|
-
graph.addEdge('https://a.com', 'https://b.com');
|
|
70
|
-
|
|
71
|
-
computePageRank(graph);
|
|
72
|
-
|
|
73
|
-
const nodeA = graph.nodes.get('https://a.com')!;
|
|
74
|
-
const nodeB = graph.nodes.get('https://b.com')!;
|
|
75
|
-
|
|
76
|
-
// Without redistribution, A would lose all rank.
|
|
77
|
-
// With redistribution, A should still have some rank.
|
|
78
|
-
expect(nodeA.pageRank).toBeGreaterThan(0);
|
|
79
|
-
expect(nodeB.pageRank).toBeGreaterThan(nodeA.pageRank!);
|
|
80
|
-
});
|
|
81
|
-
|
|
82
|
-
it('should exclude noindex pages from receiving or passing rank', () => {
|
|
83
|
-
const graph = new Graph();
|
|
84
|
-
graph.addNode('https://a.com', 0, 200);
|
|
85
|
-
graph.addNode('https://no-index.com', 1, 200);
|
|
86
|
-
graph.nodes.get('https://no-index.com')!.noindex = true;
|
|
87
|
-
|
|
88
|
-
graph.addEdge('https://a.com', 'https://no-index.com');
|
|
89
|
-
|
|
90
|
-
computePageRank(graph);
|
|
91
|
-
|
|
92
|
-
const nodeA = graph.nodes.get('https://a.com')!;
|
|
93
|
-
const nodeNoIndex = graph.nodes.get('https://no-index.com')!;
|
|
94
|
-
|
|
95
|
-
expect(nodeNoIndex.pageRank).toBeUndefined();
|
|
96
|
-
expect(nodeA.pageRank).toBe(1.0); // Only one eligible node
|
|
97
|
-
});
|
|
98
|
-
});
|
package/tests/parser.test.ts
DELETED
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
import { test, expect } from 'vitest';
|
|
2
|
-
import { Parser } from '../src/crawler/parser.js';
|
|
3
|
-
|
|
4
|
-
const parser = new Parser();
|
|
5
|
-
const baseUrl = 'https://example.com';
|
|
6
|
-
|
|
7
|
-
test('extracts links correctly', () => {
|
|
8
|
-
const html = `
|
|
9
|
-
<html>
|
|
10
|
-
<body>
|
|
11
|
-
<a href="/page1">Page 1</a>
|
|
12
|
-
<a href="https://other.com">Other</a>
|
|
13
|
-
<a href="#hash">Hash</a>
|
|
14
|
-
<a href="javascript:void(0)">JS</a>
|
|
15
|
-
</body>
|
|
16
|
-
</html>
|
|
17
|
-
`;
|
|
18
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
19
|
-
const urls = result.links.map(l => l.url);
|
|
20
|
-
expect(urls).toContain('https://example.com/page1');
|
|
21
|
-
expect(urls).toContain('https://other.com/');
|
|
22
|
-
expect(urls).not.toContain('https://example.com/#hash');
|
|
23
|
-
// It also extracts the base URL itself from href="#hash"
|
|
24
|
-
expect(urls).toContain('https://example.com/');
|
|
25
|
-
expect(result.links.length).toBe(3);
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
test('respects nofollow on links', () => {
|
|
29
|
-
const html = `
|
|
30
|
-
<html>
|
|
31
|
-
<body>
|
|
32
|
-
<a href="/page1" rel="nofollow">Page 1</a>
|
|
33
|
-
<a href="/page2">Page 2</a>
|
|
34
|
-
</body>
|
|
35
|
-
</html>
|
|
36
|
-
`;
|
|
37
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
38
|
-
const urls = result.links.map(l => l.url);
|
|
39
|
-
expect(urls).not.toContain('https://example.com/page1');
|
|
40
|
-
expect(urls).toContain('https://example.com/page2');
|
|
41
|
-
});
|
|
42
|
-
|
|
43
|
-
test('respects meta robots nofollow', () => {
|
|
44
|
-
const html = `
|
|
45
|
-
<html>
|
|
46
|
-
<head>
|
|
47
|
-
<meta name="robots" content="nofollow">
|
|
48
|
-
</head>
|
|
49
|
-
<body>
|
|
50
|
-
<a href="/page1">Page 1</a>
|
|
51
|
-
</body>
|
|
52
|
-
</html>
|
|
53
|
-
`;
|
|
54
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
55
|
-
expect(result.nofollow).toBe(true);
|
|
56
|
-
expect(result.links.length).toBe(0);
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
test('detects canonical', () => {
|
|
60
|
-
const html = `
|
|
61
|
-
<html>
|
|
62
|
-
<head>
|
|
63
|
-
<link rel="canonical" href="https://example.com/canon">
|
|
64
|
-
</head>
|
|
65
|
-
</html>
|
|
66
|
-
`;
|
|
67
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
68
|
-
expect(result.canonical).toBe('https://example.com/canon');
|
|
69
|
-
});
|
|
70
|
-
|
|
71
|
-
test('detects relative canonical', () => {
|
|
72
|
-
const html = `
|
|
73
|
-
<html>
|
|
74
|
-
<head>
|
|
75
|
-
<link rel="canonical" href="/canon">
|
|
76
|
-
</head>
|
|
77
|
-
</html>
|
|
78
|
-
`;
|
|
79
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
80
|
-
expect(result.canonical).toBe('https://example.com/canon');
|
|
81
|
-
});
|
|
82
|
-
|
|
83
|
-
test('detects soft 404', () => {
|
|
84
|
-
const html = `
|
|
85
|
-
<html>
|
|
86
|
-
<head><title>Page Not Found</title></head>
|
|
87
|
-
<body>Sorry, the page you are looking for does not exist.</body>
|
|
88
|
-
</html>
|
|
89
|
-
`;
|
|
90
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
91
|
-
expect(result.soft404Score).toBeGreaterThanOrEqual(0.5);
|
|
92
|
-
});
|
|
93
|
-
|
|
94
|
-
test('content hash ignores scripts', () => {
|
|
95
|
-
const html1 = `
|
|
96
|
-
<html><body><script>var x=1;</script><p>Hello</p></body></html>
|
|
97
|
-
`;
|
|
98
|
-
const html2 = `
|
|
99
|
-
<html><body><script>var x=2;</script><p>Hello</p></body></html>
|
|
100
|
-
`;
|
|
101
|
-
const result1 = parser.parse(html1, baseUrl, 200);
|
|
102
|
-
const result2 = parser.parse(html2, baseUrl, 200);
|
|
103
|
-
expect(result1.contentHash).toBe(result2.contentHash);
|
|
104
|
-
});
|
|
105
|
-
|
|
106
|
-
test('detects meta robots noindex', () => {
|
|
107
|
-
const html = `
|
|
108
|
-
<html>
|
|
109
|
-
<head>
|
|
110
|
-
<meta name="robots" content="noindex, nofollow">
|
|
111
|
-
</head>
|
|
112
|
-
</html>
|
|
113
|
-
`;
|
|
114
|
-
const result = parser.parse(html, baseUrl, 200);
|
|
115
|
-
expect(result.noindex).toBe(true);
|
|
116
|
-
expect(result.nofollow).toBe(true);
|
|
117
|
-
});
|