@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,227 +1,27 @@
1
1
  import { Metrics } from '../graph/metrics.js';
2
+ import { Crawl_HTML } from './crawl_template.js';
2
3
 
3
4
  function safeJson(data: any): string {
4
5
  return JSON.stringify(data).replace(/</g, '\\u003c');
5
6
  }
6
7
 
7
8
  export function generateHtml(graphData: any, metrics: Metrics): string {
8
- const graphJson = safeJson(graphData);
9
-
10
- return `<!DOCTYPE html>
11
- <html lang="en">
12
- <head>
13
- <meta charset="UTF-8">
14
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
15
- <title>Crawlith Site Graph</title>
16
- <style>
17
- body { margin: 0; overflow: hidden; font-family: sans-serif; }
18
- #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
19
- .tooltip {
20
- position: absolute;
21
- background: white;
22
- border: 1px solid #ccc;
23
- padding: 10px;
24
- pointer-events: none;
25
- font-size: 12px;
26
- box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
27
- display: none;
28
- }
29
- #metrics {
30
- position: absolute;
31
- top: 10px;
32
- left: 10px;
33
- background: rgba(255, 255, 255, 0.9);
34
- padding: 15px;
35
- border-radius: 5px;
36
- box-shadow: 0 0 10px rgba(0,0,0,0.1);
37
- max-width: 320px;
38
- max-height: 90vh;
39
- overflow-y: auto;
40
- z-index: 100;
41
- }
42
- h1 { font-size: 18px; margin-top: 0; }
43
- h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
44
- ul { padding-left: 20px; margin: 5px 0; }
45
- .legend { margin-top: 10px; font-size: 11px; }
46
- .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
47
- .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
48
- .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
49
- .stat-label { color: #666; }
50
- .stat-value { font-weight: bold; }
51
- </style>
52
- </head>
53
- <body>
54
- <div id="metrics">
55
- <h1>Crawlith Site Graph</h1>
56
-
57
- <div class="stat-row">
58
- <span class="stat-label">Discovered Pages:</span>
59
- <span class="stat-value">${metrics.totalPages}</span>
60
- </div>
61
- ${metrics.sessionStats ? `
62
- <div class="stat-row">
63
- <span class="stat-label">Session Crawl:</span>
64
- <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
65
- </div>
66
- ${metrics.sessionStats.pagesCached > 0 ? `
67
- <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
68
- <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
69
- <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
70
- </div>` : ''}
71
- ` : ''}
72
- <div class="stat-row">
73
- <span class="stat-label">Total Edges:</span>
74
- <span class="stat-value">${metrics.totalEdges}</span>
75
- </div>
76
- <div class="stat-row">
77
- <span class="stat-label">Max Depth:</span>
78
- <span class="stat-value">${metrics.maxDepthFound}</span>
79
- </div>
80
- <div class="stat-row">
81
- <span class="stat-label">Avg Out-Degree:</span>
82
- <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
83
- </div>
84
-
85
- <div class="legend">
86
- <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
87
- <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
88
- <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
89
- </div>
90
-
91
- ${metrics.topAuthorityPages.length > 0 ? `
92
- <h3>Top Authority</h3>
93
- <ul>
94
- ${metrics.topAuthorityPages.map(p => `<li><a href="${p.url}" target="_blank">${new URL(p.url).pathname}</a> (${p.authority.toFixed(2)})</li>`).join('')}
95
- </ul>
96
- ` : ''}
97
-
98
- ${metrics.orphanPages.length > 0 ? `
99
- <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
100
- <details>
101
- <summary>Show list</summary>
102
- <ul>
103
- ${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('')}
104
- ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
105
- </ul>
106
- </details>
107
- ` : ''}
108
- </div>
109
- <div id="graph"></div>
110
- <div class="tooltip" id="tooltip"></div>
111
-
112
- <script src="https://d3js.org/d3.v7.min.js"></script>
113
- <script>
114
- // Make data available globally
9
+ // Strip heavy HTML content from nodes to keep the report lightweight
10
+ const vizGraphData = {
11
+ ...graphData,
12
+ nodes: graphData.nodes ? graphData.nodes.map((n: any) => {
13
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
14
+ const { html, ...rest } = n;
15
+ return rest;
16
+ }) : []
17
+ };
18
+
19
+ const graphJson = safeJson(vizGraphData);
20
+ const metricsJson = safeJson(metrics);
21
+
22
+ return Crawl_HTML.replace('</body>', `<script>
115
23
  window.GRAPH_DATA = ${graphJson};
116
-
117
- const data = window.GRAPH_DATA;
118
- const width = window.innerWidth;
119
- const height = window.innerHeight;
120
-
121
- const svg = d3.select("#graph").append("svg")
122
- .attr("width", width)
123
- .attr("height", height)
124
- .call(d3.zoom().on("zoom", (event) => {
125
- g.attr("transform", event.transform);
126
- }));
127
-
128
- const g = svg.append("g");
129
-
130
- // Define arrow marker
131
- svg.append("defs").selectAll("marker")
132
- .data(["arrow"])
133
- .enter().append("marker")
134
- .attr("id", d => d)
135
- .attr("viewBox", "0 -5 10 10")
136
- .attr("refX", 15)
137
- .attr("refY", 0)
138
- .attr("markerWidth", 6)
139
- .attr("markerHeight", 6)
140
- .attr("orient", "auto")
141
- .append("path")
142
- .attr("d", "M0,-5L10,0L0,5")
143
- .attr("fill", "#999");
144
-
145
- const simulation = d3.forceSimulation(data.nodes)
146
- .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
147
- .force("charge", d3.forceManyBody().strength(-300))
148
- .force("center", d3.forceCenter(width / 2, height / 2))
149
- .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));
150
-
151
- const link = g.append("g")
152
- .attr("stroke", "#999")
153
- .attr("stroke-opacity", 0.6)
154
- .selectAll("line")
155
- .data(data.edges)
156
- .join("line")
157
- .attr("stroke-width", 1)
158
- .attr("marker-end", "url(#arrow)");
159
-
160
-
161
- const node = g.append("g")
162
- .attr("stroke", "#fff")
163
- .attr("stroke-width", 1.5)
164
- .selectAll("circle")
165
- .data(data.nodes)
166
- .join("circle")
167
- .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
168
- .attr("fill", d => {
169
- if (d.inLinks === 0 && d.depth > 0) return "red";
170
- if (d.depth >= 4) return "orange";
171
- return "blue";
172
- })
173
- .call(d3.drag()
174
- .on("start", dragstarted)
175
- .on("drag", dragged)
176
- .on("end", dragended));
177
-
178
- const tooltip = d3.select("#tooltip");
179
-
180
- node.on("mouseover", (event, d) => {
181
- tooltip.style("display", "block")
182
- .html(\`
183
- <strong>URL:</strong> \${d.url}<br>
184
- <strong>Depth:</strong> \${d.depth}<br>
185
- <strong>In-Links:</strong> \${d.inLinks}<br>
186
- <strong>Out-Links:</strong> \${d.outLinks}<br>
187
- <strong>Status:</strong> \${d.status}
188
- \`)
189
- .style("left", (event.pageX + 10) + "px")
190
- .style("top", (event.pageY - 10) + "px");
191
- })
192
- .on("mouseout", () => {
193
- tooltip.style("display", "none");
194
- });
195
-
196
- simulation.on("tick", () => {
197
- link
198
- .attr("x1", d => d.source.x)
199
- .attr("y1", d => d.source.y)
200
- .attr("x2", d => d.target.x)
201
- .attr("y2", d => d.target.y);
202
-
203
- node
204
- .attr("cx", d => d.x)
205
- .attr("cy", d => d.y);
206
- });
207
-
208
- function dragstarted(event, d) {
209
- if (!event.active) simulation.alphaTarget(0.3).restart();
210
- d.fx = d.x;
211
- d.fy = d.y;
212
- }
213
-
214
- function dragged(event, d) {
215
- d.fx = event.x;
216
- d.fy = event.y;
217
- }
218
-
219
- function dragended(event, d) {
220
- if (!event.active) simulation.alphaTarget(0);
221
- d.fx = null;
222
- d.fy = null;
223
- }
24
+ window.METRICS_DATA = ${metricsJson};
224
25
  </script>
225
- </body>
226
- </html>`;
26
+ </body>`);
227
27
  }
@@ -0,0 +1,241 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ import { Metrics } from '../graph/metrics.js';
3
+ import { analyzeContent } from '../analysis/content.js';
4
+ import { analyzeH1 } from '../analysis/seo.js';
5
+ import { analyzeImageAlts } from '../analysis/images.js';
6
+ import { analyzeLinks } from '../analysis/links.js';
7
+
8
+ export const THIN_CONTENT_THRESHOLD = 300;
9
+ export const LOW_INTERNAL_LINK_THRESHOLD = 2;
10
+ export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
11
+ export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
12
+ export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
13
+
14
+ export interface HealthScoreWeights {
15
+ orphans: number;
16
+ brokenLinks: number;
17
+ redirectChains: number;
18
+ duplicateClusters: number;
19
+ thinContent: number;
20
+ missingH1: number;
21
+ noindexMisuse: number;
22
+ canonicalConflicts: number;
23
+ lowInternalLinks: number;
24
+ excessiveLinks: number;
25
+ blockedByRobots: number;
26
+ }
27
+
28
+ export const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights = {
29
+ orphans: 50,
30
+ brokenLinks: 100,
31
+ redirectChains: 20,
32
+ duplicateClusters: 25,
33
+ thinContent: 15,
34
+ missingH1: 10,
35
+ noindexMisuse: 20,
36
+ canonicalConflicts: 10,
37
+ lowInternalLinks: 10,
38
+ excessiveLinks: 5,
39
+ blockedByRobots: 100
40
+ };
41
+
42
+ export interface CrawlIssueCounts {
43
+ orphanPages: number;
44
+ brokenInternalLinks: number;
45
+ redirectChains: number;
46
+ duplicateClusters: number;
47
+ canonicalConflicts: number;
48
+ accidentalNoindex: number;
49
+ missingH1: number;
50
+ thinContent: number;
51
+ lowInternalLinkCount: number;
52
+ excessiveInternalLinkCount: number;
53
+ highExternalLinkRatio: number;
54
+ imageAltMissing: number;
55
+ strongPagesUnderLinking: number;
56
+ cannibalizationClusters: number;
57
+ nearAuthorityThreshold: number;
58
+ underlinkedHighAuthorityPages: number;
59
+ externalLinks: number;
60
+ blockedByRobots: number;
61
+ }
62
+
63
+ export interface HealthScoreBreakdown {
64
+ score: number;
65
+ status: string;
66
+ weightedPenalties: Record<keyof HealthScoreWeights, number>;
67
+ weights: HealthScoreWeights;
68
+ }
69
+
70
+ function clamp(value: number, min: number, max: number): number {
71
+ return Math.min(max, Math.max(min, value));
72
+ }
73
+
74
+ export function healthStatusLabel(score: number, hasCritical: boolean = false): string {
75
+ if (hasCritical && score >= 75) return 'Needs Attention';
76
+ if (score >= 90) return 'Excellent';
77
+ if (score >= 75) return 'Good';
78
+ if (score >= 50) return 'Needs Attention';
79
+ return 'Critical';
80
+ }
81
+
82
+ export function calculateHealthScore(
83
+ totalPages: number,
84
+ issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>,
85
+ weights: HealthScoreWeights = DEFAULT_HEALTH_WEIGHTS
86
+ ): HealthScoreBreakdown {
87
+ const safePages = Math.max(totalPages, 1);
88
+
89
+ const weightedPenalties = {
90
+ orphans: clamp((issues.orphanPages / safePages) * weights.orphans, 0, weights.orphans),
91
+ brokenLinks: clamp((issues.brokenInternalLinks / safePages) * weights.brokenLinks, 0, weights.brokenLinks),
92
+ redirectChains: clamp((issues.redirectChains / safePages) * weights.redirectChains, 0, weights.redirectChains),
93
+ duplicateClusters: clamp((issues.duplicateClusters / safePages) * weights.duplicateClusters, 0, weights.duplicateClusters),
94
+ thinContent: clamp((issues.thinContent / safePages) * weights.thinContent, 0, weights.thinContent),
95
+ missingH1: clamp((issues.missingH1 / safePages) * weights.missingH1, 0, weights.missingH1),
96
+ noindexMisuse: clamp((issues.accidentalNoindex / safePages) * weights.noindexMisuse, 0, weights.noindexMisuse),
97
+ canonicalConflicts: clamp((issues.canonicalConflicts / safePages) * weights.canonicalConflicts, 0, weights.canonicalConflicts),
98
+ lowInternalLinks: clamp((issues.lowInternalLinkCount / safePages) * weights.lowInternalLinks, 0, weights.lowInternalLinks),
99
+ excessiveLinks: clamp((issues.excessiveInternalLinkCount / safePages) * weights.excessiveLinks, 0, weights.excessiveLinks),
100
+ blockedByRobots: clamp((issues.blockedByRobots / safePages) * weights.blockedByRobots, 0, weights.blockedByRobots)
101
+ };
102
+
103
+ const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
104
+ const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
105
+
106
+ const hasCritical = (
107
+ issues.orphanPages > 0 ||
108
+ issues.brokenInternalLinks > 0 ||
109
+ issues.redirectChains > 0 ||
110
+ issues.duplicateClusters > 0 ||
111
+ issues.canonicalConflicts > 0 ||
112
+ issues.accidentalNoindex > 0 ||
113
+ issues.blockedByRobots > 0
114
+ );
115
+
116
+ return {
117
+ score,
118
+ status: healthStatusLabel(score, hasCritical),
119
+ weightedPenalties,
120
+ weights
121
+ };
122
+ }
123
+
124
+ export function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts {
125
+ const nodes = graph.getNodes();
126
+
127
+ let brokenInternalLinks = 0;
128
+ let redirectChains = 0;
129
+ let canonicalConflicts = 0;
130
+ let accidentalNoindex = 0;
131
+ let missingH1 = 0;
132
+ let thinContent = 0;
133
+ let highExternalLinkRatio = 0;
134
+ let imageAltMissing = 0;
135
+ let lowInternalLinkCount = 0;
136
+ let excessiveInternalLinkCount = 0;
137
+ let strongPagesUnderLinking = 0;
138
+ let nearAuthorityThreshold = 0;
139
+ let underlinkedHighAuthorityPages = 0;
140
+ let externalLinks = 0;
141
+ let blockedByRobots = 0;
142
+
143
+ for (const node of nodes) {
144
+ if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
145
+ blockedByRobots += 1;
146
+ }
147
+
148
+ const isConfirmedError = node.status >= 400 || (node.status === 0 && (node.crawlStatus === 'network_error' || node.crawlStatus === 'failed_after_retries' || node.securityError || node.crawlStatus === 'fetched_error'));
149
+
150
+ if (isConfirmedError) {
151
+ brokenInternalLinks += 1;
152
+ }
153
+
154
+ if (node.brokenLinks) {
155
+ const actualBreaks = node.brokenLinks.filter(url => {
156
+ const target = graph.nodes.get(url);
157
+ return target && (target.status >= 400 || (target.status === 0 && (target.crawlStatus === 'network_error' || target.crawlStatus === 'failed_after_retries' || target.securityError || target.crawlStatus === 'fetched_error')));
158
+ });
159
+ brokenInternalLinks += actualBreaks.length;
160
+ }
161
+
162
+ if ((node.redirectChain?.length || 0) > 1) {
163
+ redirectChains += 1;
164
+ }
165
+ if (node.canonical && node.canonical !== node.url) {
166
+ canonicalConflicts += 1;
167
+ }
168
+ if (node.noindex && node.status >= 200 && node.status < 300) {
169
+ accidentalNoindex += 1;
170
+ }
171
+
172
+ if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
173
+ lowInternalLinkCount += 1;
174
+ }
175
+ if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
176
+ excessiveInternalLinkCount += 1;
177
+ }
178
+
179
+ if (!node.html) {
180
+ continue;
181
+ }
182
+
183
+ const h1 = analyzeH1(node.html, '');
184
+ if (h1.count === 0) {
185
+ missingH1 += 1;
186
+ }
187
+
188
+ const content = analyzeContent(node.html);
189
+ if (content.wordCount < THIN_CONTENT_THRESHOLD) {
190
+ thinContent += 1;
191
+ }
192
+
193
+ const links = analyzeLinks(node.html, node.url, node.url);
194
+ externalLinks += links.externalLinks;
195
+ if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
196
+ highExternalLinkRatio += 1;
197
+ }
198
+
199
+ const imageAlt = analyzeImageAlts(node.html);
200
+ if (imageAlt.missingAlt > 0) {
201
+ imageAltMissing += 1;
202
+ }
203
+ }
204
+
205
+ const duplicateClusters = graph.duplicateClusters?.length || 0;
206
+ const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
207
+
208
+ for (const node of nodes) {
209
+ const authority = node.pageRank || 0;
210
+ if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
211
+ strongPagesUnderLinking += 1;
212
+ }
213
+ if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
214
+ nearAuthorityThreshold += 1;
215
+ }
216
+ if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
217
+ underlinkedHighAuthorityPages += 1;
218
+ }
219
+ }
220
+
221
+ return {
222
+ orphanPages: metrics.orphanPages.length,
223
+ brokenInternalLinks,
224
+ redirectChains,
225
+ duplicateClusters,
226
+ canonicalConflicts,
227
+ accidentalNoindex,
228
+ missingH1,
229
+ thinContent,
230
+ lowInternalLinkCount,
231
+ excessiveInternalLinkCount,
232
+ highExternalLinkRatio,
233
+ imageAltMissing,
234
+ strongPagesUnderLinking,
235
+ cannibalizationClusters,
236
+ nearAuthorityThreshold,
237
+ underlinkedHighAuthorityPages,
238
+ externalLinks,
239
+ blockedByRobots
240
+ };
241
+ }
@@ -7,94 +7,104 @@ export interface HITSOptions {
7
7
  /**
8
8
  * Computes Hub and Authority scores using the HITS algorithm.
9
9
  * Operates purely on the internal link graph.
10
+ * Optimized for performance using array-based adjacency lists.
10
11
  */
11
12
  export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
12
13
  const iterations = options.iterations || 20;
13
14
  const nodes = graph.getNodes();
14
15
 
15
16
  // 1. Filter eligible nodes
16
- // Eligibility: status 200, non-redirect (redirectChain empty), not noindex, non-external
17
+ // Eligibility: status 200 (crawled) or status 0 (discovered)
18
+ // Non-redirect, not noindex (if known), non-external
17
19
  const eligibleNodes = nodes.filter(n =>
18
- n.status === 200 &&
20
+ (n.status === 200 || n.status === 0) &&
19
21
  (!n.redirectChain || n.redirectChain.length === 0) &&
20
22
  !n.noindex
21
23
  );
22
24
 
23
- if (eligibleNodes.length === 0) return;
25
+ const N = eligibleNodes.length;
26
+ if (N === 0) return;
24
27
 
25
- const urlToNode = new Map<string, GraphNode>();
26
- for (const node of eligibleNodes) {
27
- urlToNode.set(node.url, node);
28
- // 2. Initialization
29
- node.authorityScore = 1.0;
30
- node.hubScore = 1.0;
28
+ // Map URL to Index for O(1) access
29
+ const urlToIndex = new Map<string, number>();
30
+ for (let i = 0; i < N; i++) {
31
+ urlToIndex.set(eligibleNodes[i].url, i);
31
32
  }
32
33
 
33
- const allEdges = graph.getEdges();
34
- // Filter edges: internal links only (both source and target must be in eligibleNodes), no self-links
35
- const eligibleEdges = allEdges.filter(e =>
36
- e.source !== e.target &&
37
- urlToNode.has(e.source) &&
38
- urlToNode.has(e.target)
39
- );
34
+ // Build Adjacency Lists (Indices)
35
+ // incoming[i] = list of { sourceIndex, weight }
36
+ // outgoing[i] = list of { targetIndex, weight }
37
+ const incoming: { sourceIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);
38
+ const outgoing: { targetIndex: number, weight: number }[][] = new Array(N).fill(null).map(() => []);
40
39
 
41
- // Group edges for efficient iteration
42
- const incoming = new Map<string, { source: string, weight: number }[]>();
43
- const outgoing = new Map<string, { target: string, weight: number }[]>();
40
+ const allEdges = graph.getEdges();
41
+ for (const edge of allEdges) {
42
+ if (edge.source === edge.target) continue;
44
43
 
45
- for (const edge of eligibleEdges) {
46
- if (!incoming.has(edge.target)) incoming.set(edge.target, []);
47
- incoming.get(edge.target)!.push({ source: edge.source, weight: edge.weight });
44
+ const sourceIndex = urlToIndex.get(edge.source);
45
+ const targetIndex = urlToIndex.get(edge.target);
48
46
 
49
- if (!outgoing.has(edge.source)) outgoing.set(edge.source, []);
50
- outgoing.get(edge.source)!.push({ target: edge.target, weight: edge.weight });
47
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
48
+ incoming[targetIndex].push({ sourceIndex, weight: edge.weight });
49
+ outgoing[sourceIndex].push({ targetIndex, weight: edge.weight });
50
+ }
51
51
  }
52
52
 
53
- // 3. Iteration
54
- for (let i = 0; i < iterations; i++) {
53
+ // Initialize Scores
54
+ const authScores = new Float64Array(N).fill(1.0);
55
+ const hubScores = new Float64Array(N).fill(1.0);
56
+
57
+ // 2. Iteration
58
+ for (let iter = 0; iter < iterations; iter++) {
55
59
  // Update Authorities
56
60
  let normAuth = 0;
57
- for (const node of eligibleNodes) {
58
- const inLinks = incoming.get(node.url) || [];
61
+ for (let i = 0; i < N; i++) {
62
+ const inLinks = incoming[i];
59
63
  let newAuth = 0;
60
- for (const link of inLinks) {
61
- const sourceNode = urlToNode.get(link.source)!;
62
- newAuth += (sourceNode.hubScore || 0) * link.weight;
64
+ for (let j = 0; j < inLinks.length; j++) {
65
+ const link = inLinks[j];
66
+ newAuth += hubScores[link.sourceIndex] * link.weight;
63
67
  }
64
- node.authorityScore = newAuth;
68
+ authScores[i] = newAuth;
65
69
  normAuth += newAuth * newAuth;
66
70
  }
67
71
 
68
72
  // Normalize Authorities (L2 norm)
69
73
  normAuth = Math.sqrt(normAuth);
70
74
  if (normAuth > 0) {
71
- for (const node of eligibleNodes) {
72
- node.authorityScore = (node.authorityScore || 0) / normAuth;
75
+ for (let i = 0; i < N; i++) {
76
+ authScores[i] /= normAuth;
73
77
  }
74
78
  }
75
79
 
76
80
  // Update Hubs
77
81
  let normHub = 0;
78
- for (const node of eligibleNodes) {
79
- const outLinks = outgoing.get(node.url) || [];
82
+ for (let i = 0; i < N; i++) {
83
+ const outLinks = outgoing[i];
80
84
  let newHub = 0;
81
- for (const link of outLinks) {
82
- const targetNode = urlToNode.get(link.target)!;
83
- newHub += (targetNode.authorityScore || 0) * link.weight;
85
+ for (let j = 0; j < outLinks.length; j++) {
86
+ const link = outLinks[j];
87
+ newHub += authScores[link.targetIndex] * link.weight;
84
88
  }
85
- node.hubScore = newHub;
89
+ hubScores[i] = newHub;
86
90
  normHub += newHub * newHub;
87
91
  }
88
92
 
89
93
  // Normalize Hubs (L2 norm)
90
94
  normHub = Math.sqrt(normHub);
91
95
  if (normHub > 0) {
92
- for (const node of eligibleNodes) {
93
- node.hubScore = (node.hubScore || 0) / normHub;
96
+ for (let i = 0; i < N; i++) {
97
+ hubScores[i] /= normHub;
94
98
  }
95
99
  }
96
100
  }
97
101
 
102
+ // 3. Assign back to GraphNodes
103
+ for (let i = 0; i < N; i++) {
104
+ eligibleNodes[i].authorityScore = authScores[i];
105
+ eligibleNodes[i].hubScore = hubScores[i];
106
+ }
107
+
98
108
  // 4. Classification Logic
99
109
  classifyLinkRoles(eligibleNodes);
100
110
  }
@@ -106,15 +116,27 @@ function classifyLinkRoles(nodes: GraphNode[]): void {
106
116
  const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
107
117
 
108
118
  // Use 75th percentile as "high" threshold
119
+ // Using median (50th percentile) as per original implementation,
120
+ // but the comment said "Use 75th percentile" while code used median.
121
+ // I'll stick to median to avoid breaking existing behavior, but correct the comment or logic?
122
+ // The original code:
123
+ // const medianAuth = authScores[Math.floor(authScores.length / 2)];
124
+ // const isHighAuth = auth > medianAuth && auth > 0.0001;
125
+ // So it uses median. I'll keep it as median.
126
+
109
127
  const medianAuth = authScores[Math.floor(authScores.length / 2)];
110
128
  const medianHub = hubScores[Math.floor(hubScores.length / 2)];
129
+ const maxAuth = authScores[authScores.length - 1];
130
+ const maxHub = hubScores[hubScores.length - 1];
111
131
 
112
132
  for (const node of nodes) {
113
133
  const auth = node.authorityScore || 0;
114
134
  const hub = node.hubScore || 0;
115
135
 
116
- const isHighAuth = auth > medianAuth && auth > 0.0001;
117
- const isHighHub = hub > medianHub && hub > 0.0001;
136
+ // A node is high if it's above median, OR if it's the max (to handle uniform distributions)
137
+ // auth > 0 check is essential.
138
+ const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
139
+ const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
118
140
 
119
141
  if (isHighAuth && isHighHub) {
120
142
  node.linkRole = 'power';
@@ -122,7 +144,7 @@ function classifyLinkRoles(nodes: GraphNode[]): void {
122
144
  node.linkRole = 'authority';
123
145
  } else if (isHighHub) {
124
146
  node.linkRole = 'hub';
125
- } else if (auth > 0.0001 && hub > 0.0001) {
147
+ } else if (auth > 0.00001 && hub > 0.00001) {
126
148
  node.linkRole = 'balanced';
127
149
  } else {
128
150
  node.linkRole = 'peripheral';