@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/report/html.ts
CHANGED
|
@@ -1,227 +1,27 @@
|
|
|
1
1
|
import { Metrics } from '../graph/metrics.js';
|
|
2
|
+
import { Crawl_HTML } from './crawl_template.js';
|
|
2
3
|
|
|
3
4
|
function safeJson(data: any): string {
|
|
4
5
|
return JSON.stringify(data).replace(/</g, '\\u003c');
|
|
5
6
|
}
|
|
6
7
|
|
|
7
8
|
export function generateHtml(graphData: any, metrics: Metrics): string {
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
border: 1px solid #ccc;
|
|
23
|
-
padding: 10px;
|
|
24
|
-
pointer-events: none;
|
|
25
|
-
font-size: 12px;
|
|
26
|
-
box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
|
|
27
|
-
display: none;
|
|
28
|
-
}
|
|
29
|
-
#metrics {
|
|
30
|
-
position: absolute;
|
|
31
|
-
top: 10px;
|
|
32
|
-
left: 10px;
|
|
33
|
-
background: rgba(255, 255, 255, 0.9);
|
|
34
|
-
padding: 15px;
|
|
35
|
-
border-radius: 5px;
|
|
36
|
-
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
|
37
|
-
max-width: 320px;
|
|
38
|
-
max-height: 90vh;
|
|
39
|
-
overflow-y: auto;
|
|
40
|
-
z-index: 100;
|
|
41
|
-
}
|
|
42
|
-
h1 { font-size: 18px; margin-top: 0; }
|
|
43
|
-
h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
|
|
44
|
-
ul { padding-left: 20px; margin: 5px 0; }
|
|
45
|
-
.legend { margin-top: 10px; font-size: 11px; }
|
|
46
|
-
.legend-item { display: flex; align-items: center; margin-bottom: 3px; }
|
|
47
|
-
.dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
|
|
48
|
-
.stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
|
|
49
|
-
.stat-label { color: #666; }
|
|
50
|
-
.stat-value { font-weight: bold; }
|
|
51
|
-
</style>
|
|
52
|
-
</head>
|
|
53
|
-
<body>
|
|
54
|
-
<div id="metrics">
|
|
55
|
-
<h1>Crawlith Site Graph</h1>
|
|
56
|
-
|
|
57
|
-
<div class="stat-row">
|
|
58
|
-
<span class="stat-label">Discovered Pages:</span>
|
|
59
|
-
<span class="stat-value">${metrics.totalPages}</span>
|
|
60
|
-
</div>
|
|
61
|
-
${metrics.sessionStats ? `
|
|
62
|
-
<div class="stat-row">
|
|
63
|
-
<span class="stat-label">Session Crawl:</span>
|
|
64
|
-
<span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
|
|
65
|
-
</div>
|
|
66
|
-
${metrics.sessionStats.pagesCached > 0 ? `
|
|
67
|
-
<div class="stat-row" style="font-size: 11px; margin-top: -3px;">
|
|
68
|
-
<span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
|
|
69
|
-
<span class="stat-value">${metrics.sessionStats.pagesCached}</span>
|
|
70
|
-
</div>` : ''}
|
|
71
|
-
` : ''}
|
|
72
|
-
<div class="stat-row">
|
|
73
|
-
<span class="stat-label">Total Edges:</span>
|
|
74
|
-
<span class="stat-value">${metrics.totalEdges}</span>
|
|
75
|
-
</div>
|
|
76
|
-
<div class="stat-row">
|
|
77
|
-
<span class="stat-label">Max Depth:</span>
|
|
78
|
-
<span class="stat-value">${metrics.maxDepthFound}</span>
|
|
79
|
-
</div>
|
|
80
|
-
<div class="stat-row">
|
|
81
|
-
<span class="stat-label">Avg Out-Degree:</span>
|
|
82
|
-
<span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
|
|
83
|
-
</div>
|
|
84
|
-
|
|
85
|
-
<div class="legend">
|
|
86
|
-
<div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
|
|
87
|
-
<div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
|
|
88
|
-
<div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
|
|
89
|
-
</div>
|
|
90
|
-
|
|
91
|
-
${metrics.topAuthorityPages.length > 0 ? `
|
|
92
|
-
<h3>Top Authority</h3>
|
|
93
|
-
<ul>
|
|
94
|
-
${metrics.topAuthorityPages.map(p => `<li><a href="${p.url}" target="_blank">${new URL(p.url).pathname}</a> (${p.authority.toFixed(2)})</li>`).join('')}
|
|
95
|
-
</ul>
|
|
96
|
-
` : ''}
|
|
97
|
-
|
|
98
|
-
${metrics.orphanPages.length > 0 ? `
|
|
99
|
-
<h3>Orphan Pages (${metrics.orphanPages.length})</h3>
|
|
100
|
-
<details>
|
|
101
|
-
<summary>Show list</summary>
|
|
102
|
-
<ul>
|
|
103
|
-
${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('')}
|
|
104
|
-
${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
|
|
105
|
-
</ul>
|
|
106
|
-
</details>
|
|
107
|
-
` : ''}
|
|
108
|
-
</div>
|
|
109
|
-
<div id="graph"></div>
|
|
110
|
-
<div class="tooltip" id="tooltip"></div>
|
|
111
|
-
|
|
112
|
-
<script src="https://d3js.org/d3.v7.min.js"></script>
|
|
113
|
-
<script>
|
|
114
|
-
// Make data available globally
|
|
9
|
+
// Strip heavy HTML content from nodes to keep the report lightweight
|
|
10
|
+
const vizGraphData = {
|
|
11
|
+
...graphData,
|
|
12
|
+
nodes: graphData.nodes ? graphData.nodes.map((n: any) => {
|
|
13
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
14
|
+
const { html, ...rest } = n;
|
|
15
|
+
return rest;
|
|
16
|
+
}) : []
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
const graphJson = safeJson(vizGraphData);
|
|
20
|
+
const metricsJson = safeJson(metrics);
|
|
21
|
+
|
|
22
|
+
return Crawl_HTML.replace('</body>', `<script>
|
|
115
23
|
window.GRAPH_DATA = ${graphJson};
|
|
116
|
-
|
|
117
|
-
const data = window.GRAPH_DATA;
|
|
118
|
-
const width = window.innerWidth;
|
|
119
|
-
const height = window.innerHeight;
|
|
120
|
-
|
|
121
|
-
const svg = d3.select("#graph").append("svg")
|
|
122
|
-
.attr("width", width)
|
|
123
|
-
.attr("height", height)
|
|
124
|
-
.call(d3.zoom().on("zoom", (event) => {
|
|
125
|
-
g.attr("transform", event.transform);
|
|
126
|
-
}));
|
|
127
|
-
|
|
128
|
-
const g = svg.append("g");
|
|
129
|
-
|
|
130
|
-
// Define arrow marker
|
|
131
|
-
svg.append("defs").selectAll("marker")
|
|
132
|
-
.data(["arrow"])
|
|
133
|
-
.enter().append("marker")
|
|
134
|
-
.attr("id", d => d)
|
|
135
|
-
.attr("viewBox", "0 -5 10 10")
|
|
136
|
-
.attr("refX", 15)
|
|
137
|
-
.attr("refY", 0)
|
|
138
|
-
.attr("markerWidth", 6)
|
|
139
|
-
.attr("markerHeight", 6)
|
|
140
|
-
.attr("orient", "auto")
|
|
141
|
-
.append("path")
|
|
142
|
-
.attr("d", "M0,-5L10,0L0,5")
|
|
143
|
-
.attr("fill", "#999");
|
|
144
|
-
|
|
145
|
-
const simulation = d3.forceSimulation(data.nodes)
|
|
146
|
-
.force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
|
|
147
|
-
.force("charge", d3.forceManyBody().strength(-300))
|
|
148
|
-
.force("center", d3.forceCenter(width / 2, height / 2))
|
|
149
|
-
.force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));
|
|
150
|
-
|
|
151
|
-
const link = g.append("g")
|
|
152
|
-
.attr("stroke", "#999")
|
|
153
|
-
.attr("stroke-opacity", 0.6)
|
|
154
|
-
.selectAll("line")
|
|
155
|
-
.data(data.edges)
|
|
156
|
-
.join("line")
|
|
157
|
-
.attr("stroke-width", 1)
|
|
158
|
-
.attr("marker-end", "url(#arrow)");
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
const node = g.append("g")
|
|
162
|
-
.attr("stroke", "#fff")
|
|
163
|
-
.attr("stroke-width", 1.5)
|
|
164
|
-
.selectAll("circle")
|
|
165
|
-
.data(data.nodes)
|
|
166
|
-
.join("circle")
|
|
167
|
-
.attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
|
|
168
|
-
.attr("fill", d => {
|
|
169
|
-
if (d.inLinks === 0 && d.depth > 0) return "red";
|
|
170
|
-
if (d.depth >= 4) return "orange";
|
|
171
|
-
return "blue";
|
|
172
|
-
})
|
|
173
|
-
.call(d3.drag()
|
|
174
|
-
.on("start", dragstarted)
|
|
175
|
-
.on("drag", dragged)
|
|
176
|
-
.on("end", dragended));
|
|
177
|
-
|
|
178
|
-
const tooltip = d3.select("#tooltip");
|
|
179
|
-
|
|
180
|
-
node.on("mouseover", (event, d) => {
|
|
181
|
-
tooltip.style("display", "block")
|
|
182
|
-
.html(\`
|
|
183
|
-
<strong>URL:</strong> \${d.url}<br>
|
|
184
|
-
<strong>Depth:</strong> \${d.depth}<br>
|
|
185
|
-
<strong>In-Links:</strong> \${d.inLinks}<br>
|
|
186
|
-
<strong>Out-Links:</strong> \${d.outLinks}<br>
|
|
187
|
-
<strong>Status:</strong> \${d.status}
|
|
188
|
-
\`)
|
|
189
|
-
.style("left", (event.pageX + 10) + "px")
|
|
190
|
-
.style("top", (event.pageY - 10) + "px");
|
|
191
|
-
})
|
|
192
|
-
.on("mouseout", () => {
|
|
193
|
-
tooltip.style("display", "none");
|
|
194
|
-
});
|
|
195
|
-
|
|
196
|
-
simulation.on("tick", () => {
|
|
197
|
-
link
|
|
198
|
-
.attr("x1", d => d.source.x)
|
|
199
|
-
.attr("y1", d => d.source.y)
|
|
200
|
-
.attr("x2", d => d.target.x)
|
|
201
|
-
.attr("y2", d => d.target.y);
|
|
202
|
-
|
|
203
|
-
node
|
|
204
|
-
.attr("cx", d => d.x)
|
|
205
|
-
.attr("cy", d => d.y);
|
|
206
|
-
});
|
|
207
|
-
|
|
208
|
-
function dragstarted(event, d) {
|
|
209
|
-
if (!event.active) simulation.alphaTarget(0.3).restart();
|
|
210
|
-
d.fx = d.x;
|
|
211
|
-
d.fy = d.y;
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
function dragged(event, d) {
|
|
215
|
-
d.fx = event.x;
|
|
216
|
-
d.fy = event.y;
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
function dragended(event, d) {
|
|
220
|
-
if (!event.active) simulation.alphaTarget(0);
|
|
221
|
-
d.fx = null;
|
|
222
|
-
d.fy = null;
|
|
223
|
-
}
|
|
24
|
+
window.METRICS_DATA = ${metricsJson};
|
|
224
25
|
</script>
|
|
225
|
-
</body>
|
|
226
|
-
</html>`;
|
|
26
|
+
</body>`);
|
|
227
27
|
}
|
package/src/scoring/health.ts
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
import { Metrics } from '../graph/metrics.js';
|
|
3
|
+
import { analyzeContent } from '../analysis/content.js';
|
|
4
|
+
import { analyzeH1 } from '../analysis/seo.js';
|
|
5
|
+
import { analyzeImageAlts } from '../analysis/images.js';
|
|
6
|
+
import { analyzeLinks } from '../analysis/links.js';
|
|
7
|
+
|
|
8
|
+
export const THIN_CONTENT_THRESHOLD = 300;
|
|
9
|
+
export const LOW_INTERNAL_LINK_THRESHOLD = 2;
|
|
10
|
+
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
|
|
11
|
+
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
|
|
12
|
+
export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
|
|
13
|
+
|
|
14
|
+
export interface HealthScoreWeights {
|
|
15
|
+
orphans: number;
|
|
16
|
+
brokenLinks: number;
|
|
17
|
+
redirectChains: number;
|
|
18
|
+
duplicateClusters: number;
|
|
19
|
+
thinContent: number;
|
|
20
|
+
missingH1: number;
|
|
21
|
+
noindexMisuse: number;
|
|
22
|
+
canonicalConflicts: number;
|
|
23
|
+
lowInternalLinks: number;
|
|
24
|
+
excessiveLinks: number;
|
|
25
|
+
blockedByRobots: number;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
export const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights = {
|
|
29
|
+
orphans: 50,
|
|
30
|
+
brokenLinks: 100,
|
|
31
|
+
redirectChains: 20,
|
|
32
|
+
duplicateClusters: 25,
|
|
33
|
+
thinContent: 15,
|
|
34
|
+
missingH1: 10,
|
|
35
|
+
noindexMisuse: 20,
|
|
36
|
+
canonicalConflicts: 10,
|
|
37
|
+
lowInternalLinks: 10,
|
|
38
|
+
excessiveLinks: 5,
|
|
39
|
+
blockedByRobots: 100
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
export interface CrawlIssueCounts {
|
|
43
|
+
orphanPages: number;
|
|
44
|
+
brokenInternalLinks: number;
|
|
45
|
+
redirectChains: number;
|
|
46
|
+
duplicateClusters: number;
|
|
47
|
+
canonicalConflicts: number;
|
|
48
|
+
accidentalNoindex: number;
|
|
49
|
+
missingH1: number;
|
|
50
|
+
thinContent: number;
|
|
51
|
+
lowInternalLinkCount: number;
|
|
52
|
+
excessiveInternalLinkCount: number;
|
|
53
|
+
highExternalLinkRatio: number;
|
|
54
|
+
imageAltMissing: number;
|
|
55
|
+
strongPagesUnderLinking: number;
|
|
56
|
+
cannibalizationClusters: number;
|
|
57
|
+
nearAuthorityThreshold: number;
|
|
58
|
+
underlinkedHighAuthorityPages: number;
|
|
59
|
+
externalLinks: number;
|
|
60
|
+
blockedByRobots: number;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
export interface HealthScoreBreakdown {
|
|
64
|
+
score: number;
|
|
65
|
+
status: string;
|
|
66
|
+
weightedPenalties: Record<keyof HealthScoreWeights, number>;
|
|
67
|
+
weights: HealthScoreWeights;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
function clamp(value: number, min: number, max: number): number {
|
|
71
|
+
return Math.min(max, Math.max(min, value));
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
export function healthStatusLabel(score: number, hasCritical: boolean = false): string {
|
|
75
|
+
if (hasCritical && score >= 75) return 'Needs Attention';
|
|
76
|
+
if (score >= 90) return 'Excellent';
|
|
77
|
+
if (score >= 75) return 'Good';
|
|
78
|
+
if (score >= 50) return 'Needs Attention';
|
|
79
|
+
return 'Critical';
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
export function calculateHealthScore(
|
|
83
|
+
totalPages: number,
|
|
84
|
+
issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>,
|
|
85
|
+
weights: HealthScoreWeights = DEFAULT_HEALTH_WEIGHTS
|
|
86
|
+
): HealthScoreBreakdown {
|
|
87
|
+
const safePages = Math.max(totalPages, 1);
|
|
88
|
+
|
|
89
|
+
const weightedPenalties = {
|
|
90
|
+
orphans: clamp((issues.orphanPages / safePages) * weights.orphans, 0, weights.orphans),
|
|
91
|
+
brokenLinks: clamp((issues.brokenInternalLinks / safePages) * weights.brokenLinks, 0, weights.brokenLinks),
|
|
92
|
+
redirectChains: clamp((issues.redirectChains / safePages) * weights.redirectChains, 0, weights.redirectChains),
|
|
93
|
+
duplicateClusters: clamp((issues.duplicateClusters / safePages) * weights.duplicateClusters, 0, weights.duplicateClusters),
|
|
94
|
+
thinContent: clamp((issues.thinContent / safePages) * weights.thinContent, 0, weights.thinContent),
|
|
95
|
+
missingH1: clamp((issues.missingH1 / safePages) * weights.missingH1, 0, weights.missingH1),
|
|
96
|
+
noindexMisuse: clamp((issues.accidentalNoindex / safePages) * weights.noindexMisuse, 0, weights.noindexMisuse),
|
|
97
|
+
canonicalConflicts: clamp((issues.canonicalConflicts / safePages) * weights.canonicalConflicts, 0, weights.canonicalConflicts),
|
|
98
|
+
lowInternalLinks: clamp((issues.lowInternalLinkCount / safePages) * weights.lowInternalLinks, 0, weights.lowInternalLinks),
|
|
99
|
+
excessiveLinks: clamp((issues.excessiveInternalLinkCount / safePages) * weights.excessiveLinks, 0, weights.excessiveLinks),
|
|
100
|
+
blockedByRobots: clamp((issues.blockedByRobots / safePages) * weights.blockedByRobots, 0, weights.blockedByRobots)
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
|
|
104
|
+
const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
|
|
105
|
+
|
|
106
|
+
const hasCritical = (
|
|
107
|
+
issues.orphanPages > 0 ||
|
|
108
|
+
issues.brokenInternalLinks > 0 ||
|
|
109
|
+
issues.redirectChains > 0 ||
|
|
110
|
+
issues.duplicateClusters > 0 ||
|
|
111
|
+
issues.canonicalConflicts > 0 ||
|
|
112
|
+
issues.accidentalNoindex > 0 ||
|
|
113
|
+
issues.blockedByRobots > 0
|
|
114
|
+
);
|
|
115
|
+
|
|
116
|
+
return {
|
|
117
|
+
score,
|
|
118
|
+
status: healthStatusLabel(score, hasCritical),
|
|
119
|
+
weightedPenalties,
|
|
120
|
+
weights
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
export function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts {
|
|
125
|
+
const nodes = graph.getNodes();
|
|
126
|
+
|
|
127
|
+
let brokenInternalLinks = 0;
|
|
128
|
+
let redirectChains = 0;
|
|
129
|
+
let canonicalConflicts = 0;
|
|
130
|
+
let accidentalNoindex = 0;
|
|
131
|
+
let missingH1 = 0;
|
|
132
|
+
let thinContent = 0;
|
|
133
|
+
let highExternalLinkRatio = 0;
|
|
134
|
+
let imageAltMissing = 0;
|
|
135
|
+
let lowInternalLinkCount = 0;
|
|
136
|
+
let excessiveInternalLinkCount = 0;
|
|
137
|
+
let strongPagesUnderLinking = 0;
|
|
138
|
+
let nearAuthorityThreshold = 0;
|
|
139
|
+
let underlinkedHighAuthorityPages = 0;
|
|
140
|
+
let externalLinks = 0;
|
|
141
|
+
let blockedByRobots = 0;
|
|
142
|
+
|
|
143
|
+
for (const node of nodes) {
|
|
144
|
+
if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
|
|
145
|
+
blockedByRobots += 1;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const isConfirmedError = node.status >= 400 || (node.status === 0 && (node.crawlStatus === 'network_error' || node.crawlStatus === 'failed_after_retries' || node.securityError || node.crawlStatus === 'fetched_error'));
|
|
149
|
+
|
|
150
|
+
if (isConfirmedError) {
|
|
151
|
+
brokenInternalLinks += 1;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if (node.brokenLinks) {
|
|
155
|
+
const actualBreaks = node.brokenLinks.filter(url => {
|
|
156
|
+
const target = graph.nodes.get(url);
|
|
157
|
+
return target && (target.status >= 400 || (target.status === 0 && (target.crawlStatus === 'network_error' || target.crawlStatus === 'failed_after_retries' || target.securityError || target.crawlStatus === 'fetched_error')));
|
|
158
|
+
});
|
|
159
|
+
brokenInternalLinks += actualBreaks.length;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if ((node.redirectChain?.length || 0) > 1) {
|
|
163
|
+
redirectChains += 1;
|
|
164
|
+
}
|
|
165
|
+
if (node.canonical && node.canonical !== node.url) {
|
|
166
|
+
canonicalConflicts += 1;
|
|
167
|
+
}
|
|
168
|
+
if (node.noindex && node.status >= 200 && node.status < 300) {
|
|
169
|
+
accidentalNoindex += 1;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
|
|
173
|
+
lowInternalLinkCount += 1;
|
|
174
|
+
}
|
|
175
|
+
if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
|
|
176
|
+
excessiveInternalLinkCount += 1;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
if (!node.html) {
|
|
180
|
+
continue;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
const h1 = analyzeH1(node.html, '');
|
|
184
|
+
if (h1.count === 0) {
|
|
185
|
+
missingH1 += 1;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
const content = analyzeContent(node.html);
|
|
189
|
+
if (content.wordCount < THIN_CONTENT_THRESHOLD) {
|
|
190
|
+
thinContent += 1;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
const links = analyzeLinks(node.html, node.url, node.url);
|
|
194
|
+
externalLinks += links.externalLinks;
|
|
195
|
+
if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
|
|
196
|
+
highExternalLinkRatio += 1;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
const imageAlt = analyzeImageAlts(node.html);
|
|
200
|
+
if (imageAlt.missingAlt > 0) {
|
|
201
|
+
imageAltMissing += 1;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const duplicateClusters = graph.duplicateClusters?.length || 0;
|
|
206
|
+
const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
|
|
207
|
+
|
|
208
|
+
for (const node of nodes) {
|
|
209
|
+
const authority = node.pageRank || 0;
|
|
210
|
+
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
|
|
211
|
+
strongPagesUnderLinking += 1;
|
|
212
|
+
}
|
|
213
|
+
if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
|
|
214
|
+
nearAuthorityThreshold += 1;
|
|
215
|
+
}
|
|
216
|
+
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
|
|
217
|
+
underlinkedHighAuthorityPages += 1;
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
return {
|
|
222
|
+
orphanPages: metrics.orphanPages.length,
|
|
223
|
+
brokenInternalLinks,
|
|
224
|
+
redirectChains,
|
|
225
|
+
duplicateClusters,
|
|
226
|
+
canonicalConflicts,
|
|
227
|
+
accidentalNoindex,
|
|
228
|
+
missingH1,
|
|
229
|
+
thinContent,
|
|
230
|
+
lowInternalLinkCount,
|
|
231
|
+
excessiveInternalLinkCount,
|
|
232
|
+
highExternalLinkRatio,
|
|
233
|
+
imageAltMissing,
|
|
234
|
+
strongPagesUnderLinking,
|
|
235
|
+
cannibalizationClusters,
|
|
236
|
+
nearAuthorityThreshold,
|
|
237
|
+
underlinkedHighAuthorityPages,
|
|
238
|
+
externalLinks,
|
|
239
|
+
blockedByRobots
|
|
240
|
+
};
|
|
241
|
+
}
|
package/src/scoring/hits.ts
CHANGED
|
@@ -7,94 +7,104 @@ export interface HITSOptions {
|
|
|
7
7
|
/**
|
|
8
8
|
* Computes Hub and Authority scores using the HITS algorithm.
|
|
9
9
|
* Operates purely on the internal link graph.
|
|
10
|
+
* Optimized for performance using array-based adjacency lists.
|
|
10
11
|
*/
|
|
11
12
|
export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
|
|
12
13
|
const iterations = options.iterations || 20;
|
|
13
14
|
const nodes = graph.getNodes();
|
|
14
15
|
|
|
15
16
|
// 1. Filter eligible nodes
|
|
16
|
-
// Eligibility: status 200
|
|
17
|
+
// Eligibility: status 200 (crawled) or status 0 (discovered)
|
|
18
|
+
// Non-redirect, not noindex (if known), non-external
|
|
17
19
|
const eligibleNodes = nodes.filter(n =>
|
|
18
|
-
n.status === 200 &&
|
|
20
|
+
(n.status === 200 || n.status === 0) &&
|
|
19
21
|
(!n.redirectChain || n.redirectChain.length === 0) &&
|
|
20
22
|
!n.noindex
|
|
21
23
|
);
|
|
22
24
|
|
|
23
|
-
|
|
25
|
+
const N = eligibleNodes.length;
|
|
26
|
+
if (N === 0) return;
|
|
24
27
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
node.authorityScore = 1.0;
|
|
30
|
-
node.hubScore = 1.0;
|
|
28
|
+
// Map URL to Index for O(1) access
|
|
29
|
+
const urlToIndex = new Map<string, number>();
|
|
30
|
+
for (let i = 0; i < N; i++) {
|
|
31
|
+
urlToIndex.set(eligibleNodes[i].url, i);
|
|
31
32
|
}
|
|
32
33
|
|
|
33
|
-
|
|
34
|
-
//
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
urlToNode.has(e.target)
|
|
39
|
-
);
|
|
34
|
+
// Build Adjacency Lists (Indices)
|
|
35
|
+
// incoming[i] = list of { sourceIndex, weight }
|
|
36
|
+
// outgoing[i] = list of { targetIndex, weight }
|
|
37
|
+
const incoming: { sourceIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);
|
|
38
|
+
const outgoing: { targetIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);
|
|
40
39
|
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
|
|
40
|
+
const allEdges = graph.getEdges();
|
|
41
|
+
for (const edge of allEdges) {
|
|
42
|
+
if (edge.source === edge.target) continue;
|
|
44
43
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
incoming.get(edge.target)!.push({ source: edge.source, weight: edge.weight });
|
|
44
|
+
const sourceIndex = urlToIndex.get(edge.source);
|
|
45
|
+
const targetIndex = urlToIndex.get(edge.target);
|
|
48
46
|
|
|
49
|
-
if (
|
|
50
|
-
|
|
47
|
+
if (sourceIndex !== undefined && targetIndex !== undefined) {
|
|
48
|
+
incoming[targetIndex].push({ sourceIndex, weight: edge.weight });
|
|
49
|
+
outgoing[sourceIndex].push({ targetIndex, weight: edge.weight });
|
|
50
|
+
}
|
|
51
51
|
}
|
|
52
52
|
|
|
53
|
-
//
|
|
54
|
-
|
|
53
|
+
// Initialize Scores
|
|
54
|
+
const authScores = new Float64Array(N).fill(1.0);
|
|
55
|
+
const hubScores = new Float64Array(N).fill(1.0);
|
|
56
|
+
|
|
57
|
+
// 2. Iteration
|
|
58
|
+
for (let iter = 0; iter < iterations; iter++) {
|
|
55
59
|
// Update Authorities
|
|
56
60
|
let normAuth = 0;
|
|
57
|
-
for (
|
|
58
|
-
const inLinks = incoming
|
|
61
|
+
for (let i = 0; i < N; i++) {
|
|
62
|
+
const inLinks = incoming[i];
|
|
59
63
|
let newAuth = 0;
|
|
60
|
-
for (
|
|
61
|
-
const
|
|
62
|
-
newAuth +=
|
|
64
|
+
for (let j = 0; j < inLinks.length; j++) {
|
|
65
|
+
const link = inLinks[j];
|
|
66
|
+
newAuth += hubScores[link.sourceIndex] * link.weight;
|
|
63
67
|
}
|
|
64
|
-
|
|
68
|
+
authScores[i] = newAuth;
|
|
65
69
|
normAuth += newAuth * newAuth;
|
|
66
70
|
}
|
|
67
71
|
|
|
68
72
|
// Normalize Authorities (L2 norm)
|
|
69
73
|
normAuth = Math.sqrt(normAuth);
|
|
70
74
|
if (normAuth > 0) {
|
|
71
|
-
for (
|
|
72
|
-
|
|
75
|
+
for (let i = 0; i < N; i++) {
|
|
76
|
+
authScores[i] /= normAuth;
|
|
73
77
|
}
|
|
74
78
|
}
|
|
75
79
|
|
|
76
80
|
// Update Hubs
|
|
77
81
|
let normHub = 0;
|
|
78
|
-
for (
|
|
79
|
-
const outLinks = outgoing
|
|
82
|
+
for (let i = 0; i < N; i++) {
|
|
83
|
+
const outLinks = outgoing[i];
|
|
80
84
|
let newHub = 0;
|
|
81
|
-
for (
|
|
82
|
-
const
|
|
83
|
-
newHub +=
|
|
85
|
+
for (let j = 0; j < outLinks.length; j++) {
|
|
86
|
+
const link = outLinks[j];
|
|
87
|
+
newHub += authScores[link.targetIndex] * link.weight;
|
|
84
88
|
}
|
|
85
|
-
|
|
89
|
+
hubScores[i] = newHub;
|
|
86
90
|
normHub += newHub * newHub;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
// Normalize Hubs (L2 norm)
|
|
90
94
|
normHub = Math.sqrt(normHub);
|
|
91
95
|
if (normHub > 0) {
|
|
92
|
-
for (
|
|
93
|
-
|
|
96
|
+
for (let i = 0; i < N; i++) {
|
|
97
|
+
hubScores[i] /= normHub;
|
|
94
98
|
}
|
|
95
99
|
}
|
|
96
100
|
}
|
|
97
101
|
|
|
102
|
+
// 3. Assign back to GraphNodes
|
|
103
|
+
for (let i = 0; i < N; i++) {
|
|
104
|
+
eligibleNodes[i].authorityScore = authScores[i];
|
|
105
|
+
eligibleNodes[i].hubScore = hubScores[i];
|
|
106
|
+
}
|
|
107
|
+
|
|
98
108
|
// 4. Classification Logic
|
|
99
109
|
classifyLinkRoles(eligibleNodes);
|
|
100
110
|
}
|
|
@@ -106,15 +116,27 @@ function classifyLinkRoles(nodes: GraphNode[]): void {
|
|
|
106
116
|
const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
|
|
107
117
|
|
|
108
118
|
// Use 75th percentile as "high" threshold
|
|
119
|
+
// Using median (50th percentile) as per original implementation,
|
|
120
|
+
// but the comment said "Use 75th percentile" while code used median.
|
|
121
|
+
// I'll stick to median to avoid breaking existing behavior, but correct the comment or logic?
|
|
122
|
+
// The original code:
|
|
123
|
+
// const medianAuth = authScores[Math.floor(authScores.length / 2)];
|
|
124
|
+
// const isHighAuth = auth > medianAuth && auth > 0.0001;
|
|
125
|
+
// So it uses median. I'll keep it as median.
|
|
126
|
+
|
|
109
127
|
const medianAuth = authScores[Math.floor(authScores.length / 2)];
|
|
110
128
|
const medianHub = hubScores[Math.floor(hubScores.length / 2)];
|
|
129
|
+
const maxAuth = authScores[authScores.length - 1];
|
|
130
|
+
const maxHub = hubScores[hubScores.length - 1];
|
|
111
131
|
|
|
112
132
|
for (const node of nodes) {
|
|
113
133
|
const auth = node.authorityScore || 0;
|
|
114
134
|
const hub = node.hubScore || 0;
|
|
115
135
|
|
|
116
|
-
|
|
117
|
-
|
|
136
|
+
// A node is high if it's above median, OR if it's the max (to handle uniform distributions)
|
|
137
|
+
// auth > 0 check is essential.
|
|
138
|
+
const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
|
|
139
|
+
const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
|
|
118
140
|
|
|
119
141
|
if (isHighAuth && isHighHub) {
|
|
120
142
|
node.linkRole = 'power';
|
|
@@ -122,7 +144,7 @@ function classifyLinkRoles(nodes: GraphNode[]): void {
|
|
|
122
144
|
node.linkRole = 'authority';
|
|
123
145
|
} else if (isHighHub) {
|
|
124
146
|
node.linkRole = 'hub';
|
|
125
|
-
} else if (auth > 0.
|
|
147
|
+
} else if (auth > 0.00001 && hub > 0.00001) {
|
|
126
148
|
node.linkRole = 'balanced';
|
|
127
149
|
} else {
|
|
128
150
|
node.linkRole = 'peripheral';
|