@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,223 +1,22 @@
1
+ import { Crawl_HTML } from './crawl_template.js';
1
2
  function safeJson(data) {
2
3
  return JSON.stringify(data).replace(/</g, '\\u003c');
3
4
  }
4
5
  export function generateHtml(graphData, metrics) {
5
- const graphJson = safeJson(graphData);
6
- return `<!DOCTYPE html>
7
- <html lang="en">
8
- <head>
9
- <meta charset="UTF-8">
10
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
11
- <title>Crawlith Site Graph</title>
12
- <style>
13
- body { margin: 0; overflow: hidden; font-family: sans-serif; }
14
- #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
15
- .tooltip {
16
- position: absolute;
17
- background: white;
18
- border: 1px solid #ccc;
19
- padding: 10px;
20
- pointer-events: none;
21
- font-size: 12px;
22
- box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
23
- display: none;
24
- }
25
- #metrics {
26
- position: absolute;
27
- top: 10px;
28
- left: 10px;
29
- background: rgba(255, 255, 255, 0.9);
30
- padding: 15px;
31
- border-radius: 5px;
32
- box-shadow: 0 0 10px rgba(0,0,0,0.1);
33
- max-width: 320px;
34
- max-height: 90vh;
35
- overflow-y: auto;
36
- z-index: 100;
37
- }
38
- h1 { font-size: 18px; margin-top: 0; }
39
- h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
40
- ul { padding-left: 20px; margin: 5px 0; }
41
- .legend { margin-top: 10px; font-size: 11px; }
42
- .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
43
- .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
44
- .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
45
- .stat-label { color: #666; }
46
- .stat-value { font-weight: bold; }
47
- </style>
48
- </head>
49
- <body>
50
- <div id="metrics">
51
- <h1>Crawlith Site Graph</h1>
52
-
53
- <div class="stat-row">
54
- <span class="stat-label">Discovered Pages:</span>
55
- <span class="stat-value">${metrics.totalPages}</span>
56
- </div>
57
- ${metrics.sessionStats ? `
58
- <div class="stat-row">
59
- <span class="stat-label">Session Crawl:</span>
60
- <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
61
- </div>
62
- ${metrics.sessionStats.pagesCached > 0 ? `
63
- <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
64
- <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
65
- <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
66
- </div>` : ''}
67
- ` : ''}
68
- <div class="stat-row">
69
- <span class="stat-label">Total Edges:</span>
70
- <span class="stat-value">${metrics.totalEdges}</span>
71
- </div>
72
- <div class="stat-row">
73
- <span class="stat-label">Max Depth:</span>
74
- <span class="stat-value">${metrics.maxDepthFound}</span>
75
- </div>
76
- <div class="stat-row">
77
- <span class="stat-label">Avg Out-Degree:</span>
78
- <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
79
- </div>
80
-
81
- <div class="legend">
82
- <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
83
- <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
84
- <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
85
- </div>
86
-
87
- ${metrics.topAuthorityPages.length > 0 ? `
88
- <h3>Top Authority</h3>
89
- <ul>
90
- ${metrics.topAuthorityPages.map(p => `<li><a href="${p.url}" target="_blank">${new URL(p.url).pathname}</a> (${p.authority.toFixed(2)})</li>`).join('')}
91
- </ul>
92
- ` : ''}
93
-
94
- ${metrics.orphanPages.length > 0 ? `
95
- <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
96
- <details>
97
- <summary>Show list</summary>
98
- <ul>
99
- ${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('')}
100
- ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
101
- </ul>
102
- </details>
103
- ` : ''}
104
- </div>
105
- <div id="graph"></div>
106
- <div class="tooltip" id="tooltip"></div>
107
-
108
- <script src="https://d3js.org/d3.v7.min.js"></script>
109
- <script>
110
- // Make data available globally
6
+ // Strip heavy HTML content from nodes to keep the report lightweight
7
+ const vizGraphData = {
8
+ ...graphData,
9
+ nodes: graphData.nodes ? graphData.nodes.map((n) => {
10
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
11
+ const { html, ...rest } = n;
12
+ return rest;
13
+ }) : []
14
+ };
15
+ const graphJson = safeJson(vizGraphData);
16
+ const metricsJson = safeJson(metrics);
17
+ return Crawl_HTML.replace('</body>', `<script>
111
18
  window.GRAPH_DATA = ${graphJson};
112
-
113
- const data = window.GRAPH_DATA;
114
- const width = window.innerWidth;
115
- const height = window.innerHeight;
116
-
117
- const svg = d3.select("#graph").append("svg")
118
- .attr("width", width)
119
- .attr("height", height)
120
- .call(d3.zoom().on("zoom", (event) => {
121
- g.attr("transform", event.transform);
122
- }));
123
-
124
- const g = svg.append("g");
125
-
126
- // Define arrow marker
127
- svg.append("defs").selectAll("marker")
128
- .data(["arrow"])
129
- .enter().append("marker")
130
- .attr("id", d => d)
131
- .attr("viewBox", "0 -5 10 10")
132
- .attr("refX", 15)
133
- .attr("refY", 0)
134
- .attr("markerWidth", 6)
135
- .attr("markerHeight", 6)
136
- .attr("orient", "auto")
137
- .append("path")
138
- .attr("d", "M0,-5L10,0L0,5")
139
- .attr("fill", "#999");
140
-
141
- const simulation = d3.forceSimulation(data.nodes)
142
- .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
143
- .force("charge", d3.forceManyBody().strength(-300))
144
- .force("center", d3.forceCenter(width / 2, height / 2))
145
- .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));
146
-
147
- const link = g.append("g")
148
- .attr("stroke", "#999")
149
- .attr("stroke-opacity", 0.6)
150
- .selectAll("line")
151
- .data(data.edges)
152
- .join("line")
153
- .attr("stroke-width", 1)
154
- .attr("marker-end", "url(#arrow)");
155
-
156
-
157
- const node = g.append("g")
158
- .attr("stroke", "#fff")
159
- .attr("stroke-width", 1.5)
160
- .selectAll("circle")
161
- .data(data.nodes)
162
- .join("circle")
163
- .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
164
- .attr("fill", d => {
165
- if (d.inLinks === 0 && d.depth > 0) return "red";
166
- if (d.depth >= 4) return "orange";
167
- return "blue";
168
- })
169
- .call(d3.drag()
170
- .on("start", dragstarted)
171
- .on("drag", dragged)
172
- .on("end", dragended));
173
-
174
- const tooltip = d3.select("#tooltip");
175
-
176
- node.on("mouseover", (event, d) => {
177
- tooltip.style("display", "block")
178
- .html(\`
179
- <strong>URL:</strong> \${d.url}<br>
180
- <strong>Depth:</strong> \${d.depth}<br>
181
- <strong>In-Links:</strong> \${d.inLinks}<br>
182
- <strong>Out-Links:</strong> \${d.outLinks}<br>
183
- <strong>Status:</strong> \${d.status}
184
- \`)
185
- .style("left", (event.pageX + 10) + "px")
186
- .style("top", (event.pageY - 10) + "px");
187
- })
188
- .on("mouseout", () => {
189
- tooltip.style("display", "none");
190
- });
191
-
192
- simulation.on("tick", () => {
193
- link
194
- .attr("x1", d => d.source.x)
195
- .attr("y1", d => d.source.y)
196
- .attr("x2", d => d.target.x)
197
- .attr("y2", d => d.target.y);
198
-
199
- node
200
- .attr("cx", d => d.x)
201
- .attr("cy", d => d.y);
202
- });
203
-
204
- function dragstarted(event, d) {
205
- if (!event.active) simulation.alphaTarget(0.3).restart();
206
- d.fx = d.x;
207
- d.fy = d.y;
208
- }
209
-
210
- function dragged(event, d) {
211
- d.fx = event.x;
212
- d.fy = event.y;
213
- }
214
-
215
- function dragended(event, d) {
216
- if (!event.active) simulation.alphaTarget(0);
217
- d.fx = null;
218
- d.fy = null;
219
- }
19
+ window.METRICS_DATA = ${metricsJson};
220
20
  </script>
221
- </body>
222
- </html>`;
21
+ </body>`);
223
22
  }
@@ -0,0 +1,50 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ import { Metrics } from '../graph/metrics.js';
3
/** Word count below which a page is counted as thin content. */
export declare const THIN_CONTENT_THRESHOLD = 300;
/** Non-root pages with fewer inbound links than this count as under-linked. */
export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
/** Pages with more outbound links than this count as excessively linked. */
export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
/** External-link ratio above which a page is flagged. */
export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
/** PageRank at or above which a page is treated as high authority. */
export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
8
/**
 * Maximum number of points each issue category may deduct from the
 * 100-point health score.
 */
export interface HealthScoreWeights {
    orphans: number;
    brokenLinks: number;
    redirectChains: number;
    duplicateClusters: number;
    thinContent: number;
    missingH1: number;
    noindexMisuse: number;
    canonicalConflicts: number;
    lowInternalLinks: number;
    excessiveLinks: number;
    blockedByRobots: number;
}
/** Weights used by `calculateHealthScore` when none are supplied. */
export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
22
/** Per-category issue tallies produced by `collectCrawlIssues`. */
export interface CrawlIssueCounts {
    /** Number of orphan pages reported by the graph metrics. */
    orphanPages: number;
    /** Nodes in a confirmed error state, plus recorded broken links whose target is in one. */
    brokenInternalLinks: number;
    /** Nodes whose redirect chain has more than one hop. */
    redirectChains: number;
    /** Number of duplicate clusters on the graph. */
    duplicateClusters: number;
    /** Nodes whose canonical URL is set and differs from their own URL. */
    canonicalConflicts: number;
    /** Nodes marked noindex that responded with a 2xx status. */
    accidentalNoindex: number;
    /** Pages with stored HTML that contain no H1. */
    missingH1: number;
    /** Pages with stored HTML whose word count is below the thin-content threshold. */
    thinContent: number;
    /** Non-root nodes with fewer inbound links than the low-link threshold. */
    lowInternalLinkCount: number;
    /** Nodes with more outbound links than the excessive-link threshold. */
    excessiveInternalLinkCount: number;
    /** Pages whose external-link ratio exceeds the configured threshold. */
    highExternalLinkRatio: number;
    /** Pages with at least one image lacking alt text. */
    imageAltMissing: number;
    /** High-authority nodes with fewer than three outbound links. */
    strongPagesUnderLinking: number;
    /** Duplicate clusters of type `'near'`. */
    cannibalizationClusters: number;
    /** Nodes with PageRank in [0.65, authority threshold). */
    nearAuthorityThreshold: number;
    /** High-authority nodes with fewer inbound links than the low-link threshold. */
    underlinkedHighAuthorityPages: number;
    /** Total external links summed over pages with stored HTML. */
    externalLinks: number;
    /** Nodes whose crawl status is `'blocked'` or `'blocked_by_robots'`. */
    blockedByRobots: number;
}
42
/** Result of `calculateHealthScore`. */
export interface HealthScoreBreakdown {
    /** Overall score in [0, 100], rounded to one decimal place. */
    score: number;
    /** Label from `healthStatusLabel`: 'Excellent', 'Good', 'Needs Attention' or 'Critical'. */
    status: string;
    /** Points deducted per category; each is capped at that category's weight. */
    weightedPenalties: Record<keyof HealthScoreWeights, number>;
    /** The weights that were applied. */
    weights: HealthScoreWeights;
}
48
/**
 * Maps a health score to a label. A score of 75 or more is reported as
 * 'Needs Attention' when `hasCritical` is set.
 */
export declare function healthStatusLabel(score: number, hasCritical?: boolean): string;
/**
 * Computes the 0-100 health score. Each category deducts
 * `(count / totalPages) * weight`, capped at its weight.
 *
 * @param totalPages Pages in the crawl; values below 1 are treated as 1.
 * @param issues Issue tallies that feed the score.
 * @param weights Per-category weights; defaults to `DEFAULT_HEALTH_WEIGHTS`.
 */
export declare function calculateHealthScore(
    totalPages: number,
    issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>,
    weights?: HealthScoreWeights
): HealthScoreBreakdown;
/** Walks the graph's nodes and tallies every issue category. */
export declare function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts;
@@ -0,0 +1,170 @@
1
+ import { analyzeContent } from '../analysis/content.js';
2
+ import { analyzeH1 } from '../analysis/seo.js';
3
+ import { analyzeImageAlts } from '../analysis/images.js';
4
+ import { analyzeLinks } from '../analysis/links.js';
5
/** Word count below which a page is counted as thin content. */
export const THIN_CONTENT_THRESHOLD = 300;
/** Non-root pages with fewer inbound links than this count as under-linked. */
export const LOW_INTERNAL_LINK_THRESHOLD = 2;
/** Pages with more outbound links than this count as excessively linked. */
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
/** External-link ratio above which a page is flagged. */
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
/** PageRank at or above which a page is treated as high authority. */
export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
/**
 * Default per-category weights. Each value is the maximum number of points
 * that category can deduct; the weights sum to more than 100, so the final
 * score relies on being clamped at 0.
 */
export const DEFAULT_HEALTH_WEIGHTS = {
    orphans: 50,
    brokenLinks: 100,
    redirectChains: 20,
    duplicateClusters: 25,
    thinContent: 15,
    missingH1: 10,
    noindexMisuse: 20,
    canonicalConflicts: 10,
    lowInternalLinks: 10,
    excessiveLinks: 5,
    blockedByRobots: 100
};
23
/**
 * Restricts `value` to the inclusive range [`min`, `max`].
 * A `NaN` value is returned unchanged.
 */
function clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
}
26
/**
 * Maps a health score to a human-readable status label.
 *
 * Thresholds: >= 90 'Excellent', >= 75 'Good', >= 50 'Needs Attention',
 * otherwise 'Critical'. When `hasCritical` is true, a score that would be
 * 'Good' or 'Excellent' is reported as 'Needs Attention'.
 *
 * @param score Health score, expected in [0, 100].
 * @param hasCritical Whether any critical issue category is non-zero.
 * @returns The status label.
 */
export function healthStatusLabel(score, hasCritical = false) {
    if (hasCritical && score >= 75) {
        return 'Needs Attention';
    }
    if (score >= 90) {
        return 'Excellent';
    }
    if (score >= 75) {
        return 'Good';
    }
    if (score >= 50) {
        return 'Needs Attention';
    }
    return 'Critical';
}
37
/**
 * Computes the overall crawl health score.
 *
 * Every category deducts `(issueCount / totalPages) * weight` points, capped
 * at that category's weight. The deductions are summed, subtracted from 100,
 * clamped to [0, 100] and rounded to one decimal place.
 *
 * @param totalPages Pages in the crawl; non-finite values and values below 1
 *   are treated as 1 so the ratios stay defined.
 * @param issues Issue tallies; a missing or non-numeric count is treated as 0.
 * @param weights Per-category weights.
 * @returns Score, status label, per-category deductions and the weights used.
 */
export function calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
    const safePages = Number.isFinite(totalPages) ? Math.max(totalPages, 1) : 1;
    // [weight key, issue count] — order defines the key order of the result.
    const penaltySources = [
        ['orphans', issues.orphanPages],
        ['brokenLinks', issues.brokenInternalLinks],
        ['redirectChains', issues.redirectChains],
        ['duplicateClusters', issues.duplicateClusters],
        ['thinContent', issues.thinContent],
        ['missingH1', issues.missingH1],
        ['noindexMisuse', issues.accidentalNoindex],
        ['canonicalConflicts', issues.canonicalConflicts],
        ['lowInternalLinks', issues.lowInternalLinkCount],
        ['excessiveLinks', issues.excessiveInternalLinkCount],
        ['blockedByRobots', issues.blockedByRobots]
    ];
    const weightedPenalties = Object.fromEntries(penaltySources.map(([key, count]) => {
        const weight = weights[key];
        // `|| 0` keeps an undefined/NaN count from turning the score into NaN.
        const ratio = (Number(count) || 0) / safePages;
        return [key, clamp(ratio * weight, 0, weight)];
    }));
    const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
    const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
    // Thin content, missing H1 and link-count issues are deliberately not critical.
    const hasCritical = (issues.orphanPages > 0 ||
        issues.brokenInternalLinks > 0 ||
        issues.redirectChains > 0 ||
        issues.duplicateClusters > 0 ||
        issues.canonicalConflicts > 0 ||
        issues.accidentalNoindex > 0 ||
        issues.blockedByRobots > 0);
    return {
        score,
        status: healthStatusLabel(score, hasCritical),
        weightedPenalties,
        weights
    };
}
68
/**
 * Walks every node of the graph once and tallies issue counts.
 *
 * Status, link-count and authority checks run for every node. The content
 * checks (H1, word count, external links, image alts) run only for nodes
 * that still carry their `html`.
 *
 * @param graph Crawl graph; read via `getNodes()`, `nodes` and `duplicateClusters`.
 * @param metrics Graph metrics; only `orphanPages.length` is read.
 * @returns One counter per issue category.
 */
export function collectCrawlIssues(graph, metrics) {
    const nodes = graph.getNodes();
    // A node is a confirmed error on an HTTP status >= 400, or on status 0
    // together with an explicit failure marker.
    const isConfirmedError = (n) => n.status >= 400 ||
        (n.status === 0 &&
            (n.crawlStatus === 'network_error' ||
                n.crawlStatus === 'failed_after_retries' ||
                Boolean(n.securityError) ||
                n.crawlStatus === 'fetched_error'));
    let brokenInternalLinks = 0;
    let redirectChains = 0;
    let canonicalConflicts = 0;
    let accidentalNoindex = 0;
    let missingH1 = 0;
    let thinContent = 0;
    let highExternalLinkRatio = 0;
    let imageAltMissing = 0;
    let lowInternalLinkCount = 0;
    let excessiveInternalLinkCount = 0;
    let strongPagesUnderLinking = 0;
    let nearAuthorityThreshold = 0;
    let underlinkedHighAuthorityPages = 0;
    let externalLinks = 0;
    let blockedByRobots = 0;
    for (const node of nodes) {
        if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
            blockedByRobots += 1;
        }
        // NOTE(review): a failing page is counted here once and again for each
        // node listing it in `brokenLinks` — confirm this is intended.
        if (isConfirmedError(node)) {
            brokenInternalLinks += 1;
        }
        if (node.brokenLinks) {
            for (const url of node.brokenLinks) {
                const target = graph.nodes.get(url);
                if (target && isConfirmedError(target)) {
                    brokenInternalLinks += 1;
                }
            }
        }
        if ((node.redirectChain?.length || 0) > 1) {
            redirectChains += 1;
        }
        if (node.canonical && node.canonical !== node.url) {
            canonicalConflicts += 1;
        }
        if (node.noindex && node.status >= 200 && node.status < 300) {
            accidentalNoindex += 1;
        }
        if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
            lowInternalLinkCount += 1;
        }
        if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
            excessiveInternalLinkCount += 1;
        }
        // Authority-based opportunities; these must run before the HTML guard
        // below because they apply to every node.
        const authority = node.pageRank || 0;
        if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
            strongPagesUnderLinking += 1;
        }
        if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
            nearAuthorityThreshold += 1;
        }
        if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
            underlinkedHighAuthorityPages += 1;
        }
        if (!node.html) {
            continue;
        }
        if (analyzeH1(node.html, '').count === 0) {
            missingH1 += 1;
        }
        if (analyzeContent(node.html).wordCount < THIN_CONTENT_THRESHOLD) {
            thinContent += 1;
        }
        const links = analyzeLinks(node.html, node.url, node.url);
        externalLinks += links.externalLinks;
        if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
            highExternalLinkRatio += 1;
        }
        if (analyzeImageAlts(node.html).missingAlt > 0) {
            imageAltMissing += 1;
        }
    }
    const duplicateClusters = graph.duplicateClusters?.length || 0;
    const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
    return {
        orphanPages: metrics.orphanPages.length,
        brokenInternalLinks,
        redirectChains,
        duplicateClusters,
        canonicalConflicts,
        accidentalNoindex,
        missingH1,
        thinContent,
        lowInternalLinkCount,
        excessiveInternalLinkCount,
        highExternalLinkRatio,
        imageAltMissing,
        strongPagesUnderLinking,
        cannibalizationClusters,
        nearAuthorityThreshold,
        underlinkedHighAuthorityPages,
        externalLinks,
        blockedByRobots
    };
}
@@ -5,5 +5,6 @@ export interface HITSOptions {
5
5
  /**
6
6
  * Computes Hub and Authority scores using the HITS algorithm.
7
7
  * Operates purely on the internal link graph.
8
+ * Optimized for performance using array-based adjacency lists.
8
9
  */
9
10
  export declare function computeHITS(graph: Graph, options?: HITSOptions): void;
@@ -1,81 +1,90 @@
1
1
  /**
2
2
  * Computes Hub and Authority scores using the HITS algorithm.
3
3
  * Operates purely on the internal link graph.
4
+ * Optimized for performance using array-based adjacency lists.
4
5
  */
5
6
  export function computeHITS(graph, options = {}) {
6
7
  const iterations = options.iterations || 20;
7
8
  const nodes = graph.getNodes();
8
9
  // 1. Filter eligible nodes
9
- // Eligibility: status 200, non-redirect (redirectChain empty), not noindex, non-external
10
- const eligibleNodes = nodes.filter(n => n.status === 200 &&
10
+ // Eligibility: status 200 (crawled) or status 0 (discovered)
11
+ // Non-redirect, not noindex (if known), non-external
12
+ const eligibleNodes = nodes.filter(n => (n.status === 200 || n.status === 0) &&
11
13
  (!n.redirectChain || n.redirectChain.length === 0) &&
12
14
  !n.noindex);
13
- if (eligibleNodes.length === 0)
15
+ const N = eligibleNodes.length;
16
+ if (N === 0)
14
17
  return;
15
- const urlToNode = new Map();
16
- for (const node of eligibleNodes) {
17
- urlToNode.set(node.url, node);
18
- // 2. Initialization
19
- node.authorityScore = 1.0;
20
- node.hubScore = 1.0;
18
+ // Map URL to Index for O(1) access
19
+ const urlToIndex = new Map();
20
+ for (let i = 0; i < N; i++) {
21
+ urlToIndex.set(eligibleNodes[i].url, i);
21
22
  }
23
+ // Build Adjacency Lists (Indices)
24
+ // incoming[i] = list of { sourceIndex, weight }
25
+ // outgoing[i] = list of { targetIndex, weight }
26
+ const incoming = new Array(N).fill(null).map(() => []);
27
+ const outgoing = new Array(N).fill(null).map(() => []);
22
28
  const allEdges = graph.getEdges();
23
- // Filter edges: internal links only (both source and target must be in eligibleNodes), no self-links
24
- const eligibleEdges = allEdges.filter(e => e.source !== e.target &&
25
- urlToNode.has(e.source) &&
26
- urlToNode.has(e.target));
27
- // Group edges for efficient iteration
28
- const incoming = new Map();
29
- const outgoing = new Map();
30
- for (const edge of eligibleEdges) {
31
- if (!incoming.has(edge.target))
32
- incoming.set(edge.target, []);
33
- incoming.get(edge.target).push({ source: edge.source, weight: edge.weight });
34
- if (!outgoing.has(edge.source))
35
- outgoing.set(edge.source, []);
36
- outgoing.get(edge.source).push({ target: edge.target, weight: edge.weight });
29
+ for (const edge of allEdges) {
30
+ if (edge.source === edge.target)
31
+ continue;
32
+ const sourceIndex = urlToIndex.get(edge.source);
33
+ const targetIndex = urlToIndex.get(edge.target);
34
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
35
+ incoming[targetIndex].push({ sourceIndex, weight: edge.weight });
36
+ outgoing[sourceIndex].push({ targetIndex, weight: edge.weight });
37
+ }
37
38
  }
38
- // 3. Iteration
39
- for (let i = 0; i < iterations; i++) {
39
+ // Initialize Scores
40
+ const authScores = new Float64Array(N).fill(1.0);
41
+ const hubScores = new Float64Array(N).fill(1.0);
42
+ // 2. Iteration
43
+ for (let iter = 0; iter < iterations; iter++) {
40
44
  // Update Authorities
41
45
  let normAuth = 0;
42
- for (const node of eligibleNodes) {
43
- const inLinks = incoming.get(node.url) || [];
46
+ for (let i = 0; i < N; i++) {
47
+ const inLinks = incoming[i];
44
48
  let newAuth = 0;
45
- for (const link of inLinks) {
46
- const sourceNode = urlToNode.get(link.source);
47
- newAuth += (sourceNode.hubScore || 0) * link.weight;
49
+ for (let j = 0; j < inLinks.length; j++) {
50
+ const link = inLinks[j];
51
+ newAuth += hubScores[link.sourceIndex] * link.weight;
48
52
  }
49
- node.authorityScore = newAuth;
53
+ authScores[i] = newAuth;
50
54
  normAuth += newAuth * newAuth;
51
55
  }
52
56
  // Normalize Authorities (L2 norm)
53
57
  normAuth = Math.sqrt(normAuth);
54
58
  if (normAuth > 0) {
55
- for (const node of eligibleNodes) {
56
- node.authorityScore = (node.authorityScore || 0) / normAuth;
59
+ for (let i = 0; i < N; i++) {
60
+ authScores[i] /= normAuth;
57
61
  }
58
62
  }
59
63
  // Update Hubs
60
64
  let normHub = 0;
61
- for (const node of eligibleNodes) {
62
- const outLinks = outgoing.get(node.url) || [];
65
+ for (let i = 0; i < N; i++) {
66
+ const outLinks = outgoing[i];
63
67
  let newHub = 0;
64
- for (const link of outLinks) {
65
- const targetNode = urlToNode.get(link.target);
66
- newHub += (targetNode.authorityScore || 0) * link.weight;
68
+ for (let j = 0; j < outLinks.length; j++) {
69
+ const link = outLinks[j];
70
+ newHub += authScores[link.targetIndex] * link.weight;
67
71
  }
68
- node.hubScore = newHub;
72
+ hubScores[i] = newHub;
69
73
  normHub += newHub * newHub;
70
74
  }
71
75
  // Normalize Hubs (L2 norm)
72
76
  normHub = Math.sqrt(normHub);
73
77
  if (normHub > 0) {
74
- for (const node of eligibleNodes) {
75
- node.hubScore = (node.hubScore || 0) / normHub;
78
+ for (let i = 0; i < N; i++) {
79
+ hubScores[i] /= normHub;
76
80
  }
77
81
  }
78
82
  }
83
+ // 3. Assign back to GraphNodes
84
+ for (let i = 0; i < N; i++) {
85
+ eligibleNodes[i].authorityScore = authScores[i];
86
+ eligibleNodes[i].hubScore = hubScores[i];
87
+ }
79
88
  // 4. Classification Logic
80
89
  classifyLinkRoles(eligibleNodes);
81
90
  }
@@ -85,13 +94,24 @@ function classifyLinkRoles(nodes) {
85
94
  const authScores = nodes.map(n => n.authorityScore || 0).sort((a, b) => a - b);
86
95
  const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
87
96
  // Use 75th percentile as "high" threshold
97
+ // Using median (50th percentile) as per original implementation,
98
+ // but the comment said "Use 75th percentile" while code used median.
99
+ // I'll stick to median to avoid breaking existing behavior, but correct the comment or logic?
100
+ // The original code:
101
+ // const medianAuth = authScores[Math.floor(authScores.length / 2)];
102
+ // const isHighAuth = auth > medianAuth && auth > 0.0001;
103
+ // So it uses median. I'll keep it as median.
88
104
  const medianAuth = authScores[Math.floor(authScores.length / 2)];
89
105
  const medianHub = hubScores[Math.floor(hubScores.length / 2)];
106
+ const maxAuth = authScores[authScores.length - 1];
107
+ const maxHub = hubScores[hubScores.length - 1];
90
108
  for (const node of nodes) {
91
109
  const auth = node.authorityScore || 0;
92
110
  const hub = node.hubScore || 0;
93
- const isHighAuth = auth > medianAuth && auth > 0.0001;
94
- const isHighHub = hub > medianHub && hub > 0.0001;
111
+ // A node is high if it's above median, OR if it's the max (to handle uniform distributions)
112
+ // auth > 0 check is essential.
113
+ const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
114
+ const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
95
115
  if (isHighAuth && isHighHub) {
96
116
  node.linkRole = 'power';
97
117
  }
@@ -101,7 +121,7 @@ function classifyLinkRoles(nodes) {
101
121
  else if (isHighHub) {
102
122
  node.linkRole = 'hub';
103
123
  }
104
- else if (auth > 0.0001 && hub > 0.0001) {
124
+ else if (auth > 0.00001 && hub > 0.00001) {
105
125
  node.linkRole = 'balanced';
106
126
  }
107
127
  else {
@@ -1,6 +1,6 @@
1
1
  export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
2
2
  export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
3
- export interface SitegraphNode {
3
+ export interface CrawlNode {
4
4
  url: string;
5
5
  depth: number;
6
6
  inLinks: number;
@@ -17,7 +17,7 @@ export interface SitegraphNode {
17
17
  duplicateContent?: boolean;
18
18
  isProductOrCommercial?: boolean;
19
19
  }
20
- export interface SitegraphEdge {
20
+ export interface CrawlEdge {
21
21
  source: string;
22
22
  target: string;
23
23
  }
@@ -28,12 +28,12 @@ export interface OrphanScoringOptions {
28
28
  minInbound: number;
29
29
  rootUrl?: string;
30
30
  }
31
- export type AnnotatedNode = SitegraphNode & {
31
+ export type AnnotatedNode = CrawlNode & {
32
32
  orphan: boolean;
33
33
  orphanType?: OrphanType;
34
34
  orphanSeverity?: number;
35
35
  impactLevel?: ImpactLevel;
36
36
  };
37
37
  export declare function mapImpactLevel(score: number): ImpactLevel;
38
- export declare function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number;
39
- export declare function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
38
+ export declare function calculateOrphanSeverity(orphanType: OrphanType, node: CrawlNode): number;
39
+ export declare function annotateOrphans(nodes: CrawlNode[], edges: CrawlEdge[], options: OrphanScoringOptions): AnnotatedNode[];