@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/dist/report/html.js
CHANGED
|
@@ -1,223 +1,22 @@
|
|
|
1
|
+
import { Crawl_HTML } from './crawl_template.js';
|
|
1
2
|
/**
 * Serializes `data` to JSON text that is safe to inline inside an HTML
 * `<script>` element.
 *
 * - Every `<` is emitted as `\u003c`, so the payload can never contain
 *   `</script>` or `<!--` and terminate the surrounding script early.
 * - U+2028 / U+2029 are escaped because JavaScript engines that predate
 *   ES2019 treat them as line terminators inside string literals.
 * - Values with no JSON representation (`undefined`, functions, symbols) make
 *   `JSON.stringify` return `undefined`; they are emitted as `null` instead of
 *   throwing a TypeError on the `.replace` call.
 *
 * @param data Any value to serialize.
 * @returns Script-safe JSON text that parses back to the same value.
 */
function safeJson(data) {
    const json = JSON.stringify(data) ?? 'null';
    return json
        .replace(/</g, '\\u003c')
        .replace(/\u2028/g, '\\u2028')
        .replace(/\u2029/g, '\\u2029');
}
|
|
4
5
|
/**
 * Builds the standalone HTML crawl report.
 *
 * The report is the `Crawl_HTML` template with a `<script>` element injected
 * in front of its first `</body>` tag. That script publishes the crawl data as
 * `window.GRAPH_DATA` and `window.METRICS_DATA`.
 *
 * @param graphData Graph payload; each entry of `graphData.nodes` has its
 *   `html` property removed before serialization. A missing `nodes` array is
 *   serialized as `[]`.
 * @param metrics Metrics payload, serialized as-is.
 * @returns The complete report document as a string.
 */
export function generateHtml(graphData, metrics) {
    // Strip heavy HTML content from nodes to keep the report lightweight
    const vizGraphData = {
        ...graphData,
        nodes: graphData.nodes ? graphData.nodes.map((n) => {
            // eslint-disable-next-line @typescript-eslint/no-unused-vars
            const { html, ...rest } = n;
            return rest;
        }) : []
    };
    const graphJson = safeJson(vizGraphData);
    const metricsJson = safeJson(metrics);
    const dataScript = `<script>
    window.GRAPH_DATA = ${graphJson};
    window.METRICS_DATA = ${metricsJson};
  </script>
</body>`;
    // A replacer function is used on purpose: when the replacement is passed
    // as a string, `$&`, `$'`, `` $` `` and `$$` inside the crawled data
    // (URLs, titles, ...) are expanded as replacement patterns, which corrupts
    // the JSON and can splice template markup into the script.
    return Crawl_HTML.replace('</body>', () => dataScript);
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
import { Metrics } from '../graph/metrics.js';
|
|
3
|
+
/** Word count below which a page that has HTML is counted as thin content. */
export declare const THIN_CONTENT_THRESHOLD = 300;
/** `inLinks` value below which a page with `depth > 0` is counted as having low internal links. */
export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
/** `outLinks` value above which a page is counted as having excessive internal links. */
export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
/** External-link ratio above which a page is counted as having a high external link ratio. */
export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
/** `pageRank` value at or above which a page is treated as high-authority by the opportunity counters. */
export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
|
|
8
|
+
/**
 * Penalty weight per issue category of the health score.
 *
 * For every category the penalty is `(issue count / total pages) * weight`,
 * limited to the range `0..weight`, so a weight is also the maximum number of
 * points that category can subtract from the 100-point score.
 */
export interface HealthScoreWeights {
    /** Applied to `CrawlIssueCounts.orphanPages`. */
    orphans: number;
    /** Applied to `CrawlIssueCounts.brokenInternalLinks`. */
    brokenLinks: number;
    /** Applied to `CrawlIssueCounts.redirectChains`. */
    redirectChains: number;
    /** Applied to `CrawlIssueCounts.duplicateClusters`. */
    duplicateClusters: number;
    /** Applied to `CrawlIssueCounts.thinContent`. */
    thinContent: number;
    /** Applied to `CrawlIssueCounts.missingH1`. */
    missingH1: number;
    /** Applied to `CrawlIssueCounts.accidentalNoindex`. */
    noindexMisuse: number;
    /** Applied to `CrawlIssueCounts.canonicalConflicts`. */
    canonicalConflicts: number;
    /** Applied to `CrawlIssueCounts.lowInternalLinkCount`. */
    lowInternalLinks: number;
    /** Applied to `CrawlIssueCounts.excessiveInternalLinkCount`. */
    excessiveLinks: number;
    /** Applied to `CrawlIssueCounts.blockedByRobots`. */
    blockedByRobots: number;
}
/** Weights used by `calculateHealthScore` when the caller does not pass any. */
export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
|
|
22
|
+
/** Issue and opportunity counters produced by `collectCrawlIssues` for one crawl graph. */
export interface CrawlIssueCounts {
    /** Length of `metrics.orphanPages`. */
    orphanPages: number;
    /**
     * Nodes whose fetch failed (status >= 400, or status 0 with a network /
     * retry / fetch error status or a security error), plus every entry of a
     * node's `brokenLinks` whose target node is in that failed state.
     */
    brokenInternalLinks: number;
    /** Nodes whose `redirectChain` has more than one entry. */
    redirectChains: number;
    /** Number of entries in `graph.duplicateClusters`. */
    duplicateClusters: number;
    /** Nodes that declare a `canonical` different from their own `url`. */
    canonicalConflicts: number;
    /** Nodes flagged `noindex` that responded with a 2xx status. */
    accidentalNoindex: number;
    /** Nodes with HTML in which the H1 analysis found no heading. */
    missingH1: number;
    /** Nodes with HTML whose word count is below `THIN_CONTENT_THRESHOLD`. */
    thinContent: number;
    /** Nodes with `depth > 0` and `inLinks` below `LOW_INTERNAL_LINK_THRESHOLD`. */
    lowInternalLinkCount: number;
    /** Nodes with `outLinks` above `EXCESSIVE_INTERNAL_LINK_THRESHOLD`. */
    excessiveInternalLinkCount: number;
    /** Nodes with HTML whose external link ratio exceeds `HIGH_EXTERNAL_LINK_RATIO_THRESHOLD`. */
    highExternalLinkRatio: number;
    /** Nodes with HTML for which the image analysis reports at least one missing alt text. */
    imageAltMissing: number;
    /** Nodes with `pageRank >= OPPORTUNITY_AUTHORITY_THRESHOLD` and fewer than 3 `outLinks`. */
    strongPagesUnderLinking: number;
    /** Duplicate clusters whose `type` is `'near'`. */
    cannibalizationClusters: number;
    /** Nodes with `pageRank` in the range `[0.65, OPPORTUNITY_AUTHORITY_THRESHOLD)`. */
    nearAuthorityThreshold: number;
    /** Nodes with `pageRank >= OPPORTUNITY_AUTHORITY_THRESHOLD` and `inLinks` below `LOW_INTERNAL_LINK_THRESHOLD`. */
    underlinkedHighAuthorityPages: number;
    /** Sum of the external link counts of all nodes that have HTML. */
    externalLinks: number;
    /** Nodes whose `crawlStatus` is `'blocked'` or `'blocked_by_robots'`. */
    blockedByRobots: number;
}
|
|
42
|
+
/** Result of `calculateHealthScore`. */
export interface HealthScoreBreakdown {
    /** Health score in the range 0..100, rounded to one decimal place. */
    score: number;
    /** Label from `healthStatusLabel`: 'Excellent', 'Good', 'Needs Attention' or 'Critical'. */
    status: string;
    /** Points subtracted per category; each value lies between 0 and that category's weight. */
    weightedPenalties: Record<keyof HealthScoreWeights, number>;
    /** The weights the score was computed with. */
    weights: HealthScoreWeights;
}
|
|
48
|
+
/**
 * Maps a health score to its status label.
 *
 * Scores >= 90 are 'Excellent', >= 75 'Good', >= 50 'Needs Attention' and
 * anything lower 'Critical'. When `hasCritical` is set, a score >= 75 is
 * labelled 'Needs Attention' instead of 'Good' / 'Excellent'.
 *
 * @param score Health score.
 * @param hasCritical Whether a critical issue is present; defaults to `false`.
 */
export declare function healthStatusLabel(score: number, hasCritical?: boolean): string;
/**
 * Computes the health score of a crawl.
 *
 * Each category contributes `(issue count / max(totalPages, 1)) * weight`,
 * limited to `0..weight`. The score is 100 minus the sum of those penalties,
 * limited to `0..100` and rounded to one decimal place. The status label is
 * computed with `hasCritical` set when any of `orphanPages`,
 * `brokenInternalLinks`, `redirectChains`, `duplicateClusters`,
 * `canonicalConflicts`, `accidentalNoindex` or `blockedByRobots` is non-zero.
 *
 * @param totalPages Number of pages the issue counts are relative to.
 * @param issues Issue counters that feed the penalties.
 * @param weights Penalty weights; defaults to `DEFAULT_HEALTH_WEIGHTS`.
 */
export declare function calculateHealthScore(totalPages: number, issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>, weights?: HealthScoreWeights): HealthScoreBreakdown;
/**
 * Walks all nodes of `graph` and tallies crawl issues and linking
 * opportunities. Content-based counters (H1, word count, links, image alts)
 * only consider nodes that carry `html`.
 *
 * @param graph Crawl graph to inspect.
 * @param metrics Graph metrics; only `orphanPages` is read.
 */
export declare function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts;
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { analyzeContent } from '../analysis/content.js';
|
|
2
|
+
import { analyzeH1 } from '../analysis/seo.js';
|
|
3
|
+
import { analyzeImageAlts } from '../analysis/images.js';
|
|
4
|
+
import { analyzeLinks } from '../analysis/links.js';
|
|
5
|
+
/** Word count below which a page that has HTML is counted as thin content. */
export const THIN_CONTENT_THRESHOLD = 300;
/** `inLinks` value below which a page with `depth > 0` is counted as having low internal links. */
export const LOW_INTERNAL_LINK_THRESHOLD = 2;
/** `outLinks` value above which a page is counted as having excessive internal links. */
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
/** External-link ratio above which a page is counted as having a high external link ratio. */
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
/** `pageRank` value at or above which a page is treated as high-authority by the opportunity counters. */
export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
|
|
10
|
+
/**
 * Default penalty weights of the health score.
 *
 * Each value is the maximum number of points its category can subtract from
 * the 100-point score. The weights add up to 365, so the summed penalty can
 * exceed 100; the final score is limited to the range 0..100.
 */
export const DEFAULT_HEALTH_WEIGHTS = {
    orphans: 50,
    brokenLinks: 100,
    redirectChains: 20,
    duplicateClusters: 25,
    thinContent: 15,
    missingH1: 10,
    noindexMisuse: 20,
    canonicalConflicts: 10,
    lowInternalLinks: 10,
    excessiveLinks: 5,
    blockedByRobots: 100
};
|
|
23
|
+
/**
 * Limits `value` to the closed interval `[min, max]`.
 *
 * @param value Number to limit.
 * @param min Lower bound.
 * @param max Upper bound.
 * @returns `min` when `value` is smaller, `max` when it is larger, otherwise `value`.
 */
function clamp(value, min, max) {
    // Raise to the floor first, then cut off at the ceiling.
    const atLeastMin = Math.max(min, value);
    return Math.min(max, atLeastMin);
}
|
|
26
|
+
/**
 * Maps a health score to its status label.
 *
 * | score   | label                                             |
 * |---------|---------------------------------------------------|
 * | >= 90   | 'Excellent' ('Needs Attention' if `hasCritical`)  |
 * | >= 75   | 'Good' ('Needs Attention' if `hasCritical`)       |
 * | >= 50   | 'Needs Attention'                                 |
 * | below   | 'Critical'                                        |
 *
 * @param score Health score.
 * @param hasCritical Whether a critical issue is present; caps the label of
 *   otherwise good scores at 'Needs Attention'. Defaults to `false`.
 * @returns The status label.
 */
export function healthStatusLabel(score, hasCritical = false) {
    if (score >= 75) {
        // A critical issue overrides the two best labels.
        if (hasCritical) {
            return 'Needs Attention';
        }
        return score >= 90 ? 'Excellent' : 'Good';
    }
    return score >= 50 ? 'Needs Attention' : 'Critical';
}
|
|
37
|
+
/**
 * Pairs every penalty category (a key of the weights object) with the issue
 * counter that feeds it. The order fixes the key order of `weightedPenalties`.
 */
const HEALTH_PENALTY_SOURCES = [
    ['orphans', 'orphanPages'],
    ['brokenLinks', 'brokenInternalLinks'],
    ['redirectChains', 'redirectChains'],
    ['duplicateClusters', 'duplicateClusters'],
    ['thinContent', 'thinContent'],
    ['missingH1', 'missingH1'],
    ['noindexMisuse', 'accidentalNoindex'],
    ['canonicalConflicts', 'canonicalConflicts'],
    ['lowInternalLinks', 'lowInternalLinkCount'],
    ['excessiveLinks', 'excessiveInternalLinkCount'],
    ['blockedByRobots', 'blockedByRobots']
];
/** Issue counters that mark the crawl as having a critical problem when non-zero. */
const CRITICAL_ISSUE_KEYS = [
    'orphanPages',
    'brokenInternalLinks',
    'redirectChains',
    'duplicateClusters',
    'canonicalConflicts',
    'accidentalNoindex',
    'blockedByRobots'
];
/**
 * Computes the health score of a crawl.
 *
 * Each category contributes `(issue count / pages) * weight`, limited to
 * `0..weight`. The score is 100 minus the sum of those penalties, limited to
 * `0..100` and rounded to one decimal place.
 *
 * @param totalPages Number of pages the counts are relative to. Values below
 *   1 and non-finite values are treated as 1, so the score is never NaN
 *   because of the page count.
 * @param issues Issue counters (see `HEALTH_PENALTY_SOURCES` for the ones read).
 * @param weights Penalty weights; defaults to `DEFAULT_HEALTH_WEIGHTS`.
 *   Categories missing from the object fall back to their default weight.
 * @returns Score, status label, per-category penalties and the weights used.
 *   `weights` in the result is a copy, so mutating it cannot alter
 *   `DEFAULT_HEALTH_WEIGHTS` or the caller's object.
 */
export function calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
    // Copy instead of aliasing: the result object is handed to callers.
    const appliedWeights = { ...DEFAULT_HEALTH_WEIGHTS, ...weights };
    const safePages = Number.isFinite(totalPages) ? Math.max(totalPages, 1) : 1;
    const weightedPenalties = Object.fromEntries(HEALTH_PENALTY_SOURCES.map(([category, issueKey]) => {
        const cap = appliedWeights[category];
        return [category, clamp((issues[issueKey] / safePages) * cap, 0, cap)];
    }));
    const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
    const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
    const hasCritical = CRITICAL_ISSUE_KEYS.some((key) => issues[key] > 0);
    return {
        score,
        status: healthStatusLabel(score, hasCritical),
        weightedPenalties,
        weights: appliedWeights
    };
}
|
|
68
|
+
/**
 * Walks all nodes of the crawl graph and tallies issues and linking
 * opportunities.
 *
 * The first pass counts status / link / indexing issues for every node and,
 * for nodes that carry `html`, the content-based issues (H1, word count,
 * external links, image alts). The second pass counts the authority-based
 * opportunities from each node's `pageRank`.
 *
 * @param graph Crawl graph; read through `getNodes()`, `nodes.get(url)` and
 *   `duplicateClusters`.
 * @param metrics Graph metrics; only `orphanPages.length` is read.
 * @returns One counter per issue / opportunity type.
 */
export function collectCrawlIssues(graph, metrics) {
    // Shared by the page-level and the link-level broken checks.
    const fetchFailed = (page) => page.status >= 400 ||
        (page.status === 0 && (page.crawlStatus === 'network_error' ||
            page.crawlStatus === 'failed_after_retries' ||
            page.securityError ||
            page.crawlStatus === 'fetched_error'));
    const pages = graph.getNodes();
    const tally = {
        brokenInternalLinks: 0,
        redirectChains: 0,
        canonicalConflicts: 0,
        accidentalNoindex: 0,
        missingH1: 0,
        thinContent: 0,
        highExternalLinkRatio: 0,
        imageAltMissing: 0,
        lowInternalLinkCount: 0,
        excessiveInternalLinkCount: 0,
        strongPagesUnderLinking: 0,
        nearAuthorityThreshold: 0,
        underlinkedHighAuthorityPages: 0,
        externalLinks: 0,
        blockedByRobots: 0
    };
    // Pass 1: per-page status, link and content issues.
    for (const page of pages) {
        if (page.crawlStatus === 'blocked' || page.crawlStatus === 'blocked_by_robots') {
            tally.blockedByRobots += 1;
        }
        if (fetchFailed(page)) {
            tally.brokenInternalLinks += 1;
        }
        if (page.brokenLinks) {
            // Only links whose target is a known node in a failed state count.
            const deadTargets = page.brokenLinks.filter((href) => {
                const target = graph.nodes.get(href);
                return target && fetchFailed(target);
            });
            tally.brokenInternalLinks += deadTargets.length;
        }
        const hops = page.redirectChain?.length || 0;
        if (hops > 1) {
            tally.redirectChains += 1;
        }
        if (page.canonical && page.canonical !== page.url) {
            tally.canonicalConflicts += 1;
        }
        if (page.noindex && page.status >= 200 && page.status < 300) {
            tally.accidentalNoindex += 1;
        }
        if (page.inLinks < LOW_INTERNAL_LINK_THRESHOLD && page.depth > 0) {
            tally.lowInternalLinkCount += 1;
        }
        if (page.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
            tally.excessiveInternalLinkCount += 1;
        }
        // Everything below needs the page markup.
        if (!page.html) {
            continue;
        }
        if (analyzeH1(page.html, '').count === 0) {
            tally.missingH1 += 1;
        }
        if (analyzeContent(page.html).wordCount < THIN_CONTENT_THRESHOLD) {
            tally.thinContent += 1;
        }
        const linkStats = analyzeLinks(page.html, page.url, page.url);
        tally.externalLinks += linkStats.externalLinks;
        if (linkStats.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
            tally.highExternalLinkRatio += 1;
        }
        if (analyzeImageAlts(page.html).missingAlt > 0) {
            tally.imageAltMissing += 1;
        }
    }
    const duplicateClusters = graph.duplicateClusters?.length || 0;
    const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
    // Pass 2: authority-based opportunities.
    for (const page of pages) {
        const authority = page.pageRank || 0;
        const isHighAuthority = authority >= OPPORTUNITY_AUTHORITY_THRESHOLD;
        if (isHighAuthority && page.outLinks < 3) {
            tally.strongPagesUnderLinking += 1;
        }
        if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
            tally.nearAuthorityThreshold += 1;
        }
        if (isHighAuthority && page.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
            tally.underlinkedHighAuthorityPages += 1;
        }
    }
    return {
        orphanPages: metrics.orphanPages.length,
        brokenInternalLinks: tally.brokenInternalLinks,
        redirectChains: tally.redirectChains,
        duplicateClusters,
        canonicalConflicts: tally.canonicalConflicts,
        accidentalNoindex: tally.accidentalNoindex,
        missingH1: tally.missingH1,
        thinContent: tally.thinContent,
        lowInternalLinkCount: tally.lowInternalLinkCount,
        excessiveInternalLinkCount: tally.excessiveInternalLinkCount,
        highExternalLinkRatio: tally.highExternalLinkRatio,
        imageAltMissing: tally.imageAltMissing,
        strongPagesUnderLinking: tally.strongPagesUnderLinking,
        cannibalizationClusters,
        nearAuthorityThreshold: tally.nearAuthorityThreshold,
        underlinkedHighAuthorityPages: tally.underlinkedHighAuthorityPages,
        externalLinks: tally.externalLinks,
        blockedByRobots: tally.blockedByRobots
    };
}
|
package/dist/scoring/hits.d.ts
CHANGED
|
@@ -5,5 +5,6 @@ export interface HITSOptions {
|
|
|
5
5
|
/**
|
|
6
6
|
* Computes Hub and Authority scores using the HITS algorithm.
|
|
7
7
|
* Operates purely on the internal link graph.
|
|
8
|
+
* Optimized for performance using array-based adjacency lists.
|
|
8
9
|
*/
|
|
9
10
|
export declare function computeHITS(graph: Graph, options?: HITSOptions): void;
|
package/dist/scoring/hits.js
CHANGED
|
@@ -1,81 +1,90 @@
|
|
|
1
1
|
/**
 * Computes hub and authority scores with the HITS algorithm and writes them
 * to `authorityScore` / `hubScore` of every eligible node.
 *
 * A node is eligible when its status is 200 (crawled) or 0 (discovered), it
 * has no redirect chain and it is not flagged `noindex`. Only edges between
 * two eligible nodes take part; self-links are ignored. Nodes that are not
 * eligible are left untouched. After scoring, the eligible nodes are passed
 * to `classifyLinkRoles`.
 *
 * The graph is turned into index-based adjacency lists and the scores are
 * kept in typed arrays while iterating.
 *
 * @param graph Link graph, read through `getNodes()` and `getEdges()`.
 * @param options `iterations` sets the number of update rounds; a missing or
 *   falsy value means 20.
 */
export function computeHITS(graph, options = {}) {
    const rounds = options.iterations || 20;
    // 1. Select the nodes that take part in the computation.
    const candidates = graph.getNodes().filter((page) => {
        const statusOk = page.status === 200 || page.status === 0;
        const redirected = page.redirectChain && page.redirectChain.length !== 0;
        return statusOk && !redirected && !page.noindex;
    });
    const count = candidates.length;
    if (count === 0) {
        return;
    }
    // URL -> position in `candidates`.
    const indexByUrl = new Map();
    candidates.forEach((page, position) => {
        indexByUrl.set(page.url, position);
    });
    // inbound[i]  = [{ sourceIndex, weight }] for links pointing at node i
    // outbound[i] = [{ targetIndex, weight }] for links leaving node i
    const inbound = Array.from({ length: count }, () => []);
    const outbound = Array.from({ length: count }, () => []);
    for (const link of graph.getEdges()) {
        if (link.source === link.target) {
            continue;
        }
        const sourceIndex = indexByUrl.get(link.source);
        const targetIndex = indexByUrl.get(link.target);
        // Drop edges that touch a node outside the candidate set.
        if (sourceIndex === undefined || targetIndex === undefined) {
            continue;
        }
        inbound[targetIndex].push({ sourceIndex, weight: link.weight });
        outbound[sourceIndex].push({ targetIndex, weight: link.weight });
    }
    const authority = new Float64Array(count).fill(1.0);
    const hub = new Float64Array(count).fill(1.0);
    // 2. Alternate authority and hub updates, L2-normalizing after each.
    for (let round = 0; round < rounds; round++) {
        let authSquares = 0;
        for (const [i, links] of inbound.entries()) {
            let total = 0;
            for (const { sourceIndex, weight } of links) {
                total += hub[sourceIndex] * weight;
            }
            authority[i] = total;
            authSquares += total * total;
        }
        const authNorm = Math.sqrt(authSquares);
        if (authNorm > 0) {
            for (let i = 0; i < count; i++) {
                authority[i] /= authNorm;
            }
        }
        let hubSquares = 0;
        for (const [i, links] of outbound.entries()) {
            let total = 0;
            for (const { targetIndex, weight } of links) {
                total += authority[targetIndex] * weight;
            }
            hub[i] = total;
            hubSquares += total * total;
        }
        const hubNorm = Math.sqrt(hubSquares);
        if (hubNorm > 0) {
            for (let i = 0; i < count; i++) {
                hub[i] /= hubNorm;
            }
        }
    }
    // 3. Copy the scores back onto the graph nodes.
    candidates.forEach((page, i) => {
        page.authorityScore = authority[i];
        page.hubScore = hub[i];
    });
    // 4. Classification Logic
    classifyLinkRoles(candidates);
}
|
|
@@ -85,13 +94,24 @@ function classifyLinkRoles(nodes) {
|
|
|
85
94
|
const authScores = nodes.map(n => n.authorityScore || 0).sort((a, b) => a - b);
|
|
86
95
|
const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
|
|
87
96
|
// Use 75th percentile as "high" threshold
|
|
97
|
+
// Using median (50th percentile) as per original implementation,
|
|
98
|
+
// but the comment said "Use 75th percentile" while code used median.
|
|
99
|
+
// I'll stick to median to avoid breaking existing behavior, but correct the comment or logic?
|
|
100
|
+
// The original code:
|
|
101
|
+
// const medianAuth = authScores[Math.floor(authScores.length / 2)];
|
|
102
|
+
// const isHighAuth = auth > medianAuth && auth > 0.0001;
|
|
103
|
+
// So it uses median. I'll keep it as median.
|
|
88
104
|
const medianAuth = authScores[Math.floor(authScores.length / 2)];
|
|
89
105
|
const medianHub = hubScores[Math.floor(hubScores.length / 2)];
|
|
106
|
+
const maxAuth = authScores[authScores.length - 1];
|
|
107
|
+
const maxHub = hubScores[hubScores.length - 1];
|
|
90
108
|
for (const node of nodes) {
|
|
91
109
|
const auth = node.authorityScore || 0;
|
|
92
110
|
const hub = node.hubScore || 0;
|
|
93
|
-
|
|
94
|
-
|
|
111
|
+
// A node is high if it's above median, OR if it's the max (to handle uniform distributions)
|
|
112
|
+
// auth > 0 check is essential.
|
|
113
|
+
const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
|
|
114
|
+
const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
|
|
95
115
|
if (isHighAuth && isHighHub) {
|
|
96
116
|
node.linkRole = 'power';
|
|
97
117
|
}
|
|
@@ -101,7 +121,7 @@ function classifyLinkRoles(nodes) {
|
|
|
101
121
|
else if (isHighHub) {
|
|
102
122
|
node.linkRole = 'hub';
|
|
103
123
|
}
|
|
104
|
-
else if (auth > 0.
|
|
124
|
+
else if (auth > 0.00001 && hub > 0.00001) {
|
|
105
125
|
node.linkRole = 'balanced';
|
|
106
126
|
}
|
|
107
127
|
else {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
|
|
2
2
|
export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
|
|
3
|
-
export interface
|
|
3
|
+
export interface CrawlNode {
|
|
4
4
|
url: string;
|
|
5
5
|
depth: number;
|
|
6
6
|
inLinks: number;
|
|
@@ -17,7 +17,7 @@ export interface SitegraphNode {
|
|
|
17
17
|
duplicateContent?: boolean;
|
|
18
18
|
isProductOrCommercial?: boolean;
|
|
19
19
|
}
|
|
20
|
-
export interface
|
|
20
|
+
export interface CrawlEdge {
|
|
21
21
|
source: string;
|
|
22
22
|
target: string;
|
|
23
23
|
}
|
|
@@ -28,12 +28,12 @@ export interface OrphanScoringOptions {
|
|
|
28
28
|
minInbound: number;
|
|
29
29
|
rootUrl?: string;
|
|
30
30
|
}
|
|
31
|
-
export type AnnotatedNode =
|
|
31
|
+
export type AnnotatedNode = CrawlNode & {
|
|
32
32
|
orphan: boolean;
|
|
33
33
|
orphanType?: OrphanType;
|
|
34
34
|
orphanSeverity?: number;
|
|
35
35
|
impactLevel?: ImpactLevel;
|
|
36
36
|
};
|
|
37
37
|
export declare function mapImpactLevel(score: number): ImpactLevel;
|
|
38
|
-
export declare function calculateOrphanSeverity(orphanType: OrphanType, node:
|
|
39
|
-
export declare function annotateOrphans(nodes:
|
|
38
|
+
export declare function calculateOrphanSeverity(orphanType: OrphanType, node: CrawlNode): number;
|
|
39
|
+
export declare function annotateOrphans(nodes: CrawlNode[], edges: CrawlEdge[], options: OrphanScoringOptions): AnnotatedNode[];
|