@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,227 +0,0 @@
1
- import { Metrics } from '../graph/metrics.js';
2
-
3
- function safeJson(data: any): string {
4
- return JSON.stringify(data).replace(/</g, '\\u003c');
5
- }
6
-
7
- export function generateHtml(graphData: any, metrics: Metrics): string {
8
- const graphJson = safeJson(graphData);
9
-
10
- return `<!DOCTYPE html>
11
- <html lang="en">
12
- <head>
13
- <meta charset="UTF-8">
14
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
15
- <title>Crawlith Site Graph</title>
16
- <style>
17
- body { margin: 0; overflow: hidden; font-family: sans-serif; }
18
- #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
19
- .tooltip {
20
- position: absolute;
21
- background: white;
22
- border: 1px solid #ccc;
23
- padding: 10px;
24
- pointer-events: none;
25
- font-size: 12px;
26
- box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
27
- display: none;
28
- }
29
- #metrics {
30
- position: absolute;
31
- top: 10px;
32
- left: 10px;
33
- background: rgba(255, 255, 255, 0.9);
34
- padding: 15px;
35
- border-radius: 5px;
36
- box-shadow: 0 0 10px rgba(0,0,0,0.1);
37
- max-width: 320px;
38
- max-height: 90vh;
39
- overflow-y: auto;
40
- z-index: 100;
41
- }
42
- h1 { font-size: 18px; margin-top: 0; }
43
- h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
44
- ul { padding-left: 20px; margin: 5px 0; }
45
- .legend { margin-top: 10px; font-size: 11px; }
46
- .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
47
- .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
48
- .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
49
- .stat-label { color: #666; }
50
- .stat-value { font-weight: bold; }
51
- </style>
52
- </head>
53
- <body>
54
- <div id="metrics">
55
- <h1>Crawlith Site Graph</h1>
56
-
57
- <div class="stat-row">
58
- <span class="stat-label">Discovered Pages:</span>
59
- <span class="stat-value">${metrics.totalPages}</span>
60
- </div>
61
- ${metrics.sessionStats ? `
62
- <div class="stat-row">
63
- <span class="stat-label">Session Crawl:</span>
64
- <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
65
- </div>
66
- ${metrics.sessionStats.pagesCached > 0 ? `
67
- <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
68
- <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
69
- <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
70
- </div>` : ''}
71
- ` : ''}
72
- <div class="stat-row">
73
- <span class="stat-label">Total Edges:</span>
74
- <span class="stat-value">${metrics.totalEdges}</span>
75
- </div>
76
- <div class="stat-row">
77
- <span class="stat-label">Max Depth:</span>
78
- <span class="stat-value">${metrics.maxDepthFound}</span>
79
- </div>
80
- <div class="stat-row">
81
- <span class="stat-label">Avg Out-Degree:</span>
82
- <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
83
- </div>
84
-
85
- <div class="legend">
86
- <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
87
- <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
88
- <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
89
- </div>
90
-
91
- ${metrics.topAuthorityPages.length > 0 ? `
92
- <h3>Top Authority</h3>
93
- <ul>
94
- ${metrics.topAuthorityPages.map(p => `<li><a href="${p.url}" target="_blank">${new URL(p.url).pathname}</a> (${p.authority.toFixed(2)})</li>`).join('')}
95
- </ul>
96
- ` : ''}
97
-
98
- ${metrics.orphanPages.length > 0 ? `
99
- <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
100
- <details>
101
- <summary>Show list</summary>
102
- <ul>
103
- ${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('')}
104
- ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
105
- </ul>
106
- </details>
107
- ` : ''}
108
- </div>
109
- <div id="graph"></div>
110
- <div class="tooltip" id="tooltip"></div>
111
-
112
- <script src="https://d3js.org/d3.v7.min.js"></script>
113
- <script>
114
- // Make data available globally
115
- window.GRAPH_DATA = ${graphJson};
116
-
117
- const data = window.GRAPH_DATA;
118
- const width = window.innerWidth;
119
- const height = window.innerHeight;
120
-
121
- const svg = d3.select("#graph").append("svg")
122
- .attr("width", width)
123
- .attr("height", height)
124
- .call(d3.zoom().on("zoom", (event) => {
125
- g.attr("transform", event.transform);
126
- }));
127
-
128
- const g = svg.append("g");
129
-
130
- // Define arrow marker
131
- svg.append("defs").selectAll("marker")
132
- .data(["arrow"])
133
- .enter().append("marker")
134
- .attr("id", d => d)
135
- .attr("viewBox", "0 -5 10 10")
136
- .attr("refX", 15)
137
- .attr("refY", 0)
138
- .attr("markerWidth", 6)
139
- .attr("markerHeight", 6)
140
- .attr("orient", "auto")
141
- .append("path")
142
- .attr("d", "M0,-5L10,0L0,5")
143
- .attr("fill", "#999");
144
-
145
- const simulation = d3.forceSimulation(data.nodes)
146
- .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
147
- .force("charge", d3.forceManyBody().strength(-300))
148
- .force("center", d3.forceCenter(width / 2, height / 2))
149
- .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));
150
-
151
- const link = g.append("g")
152
- .attr("stroke", "#999")
153
- .attr("stroke-opacity", 0.6)
154
- .selectAll("line")
155
- .data(data.edges)
156
- .join("line")
157
- .attr("stroke-width", 1)
158
- .attr("marker-end", "url(#arrow)");
159
-
160
-
161
- const node = g.append("g")
162
- .attr("stroke", "#fff")
163
- .attr("stroke-width", 1.5)
164
- .selectAll("circle")
165
- .data(data.nodes)
166
- .join("circle")
167
- .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
168
- .attr("fill", d => {
169
- if (d.inLinks === 0 && d.depth > 0) return "red";
170
- if (d.depth >= 4) return "orange";
171
- return "blue";
172
- })
173
- .call(d3.drag()
174
- .on("start", dragstarted)
175
- .on("drag", dragged)
176
- .on("end", dragended));
177
-
178
- const tooltip = d3.select("#tooltip");
179
-
180
- node.on("mouseover", (event, d) => {
181
- tooltip.style("display", "block")
182
- .html(\`
183
- <strong>URL:</strong> \${d.url}<br>
184
- <strong>Depth:</strong> \${d.depth}<br>
185
- <strong>In-Links:</strong> \${d.inLinks}<br>
186
- <strong>Out-Links:</strong> \${d.outLinks}<br>
187
- <strong>Status:</strong> \${d.status}
188
- \`)
189
- .style("left", (event.pageX + 10) + "px")
190
- .style("top", (event.pageY - 10) + "px");
191
- })
192
- .on("mouseout", () => {
193
- tooltip.style("display", "none");
194
- });
195
-
196
- simulation.on("tick", () => {
197
- link
198
- .attr("x1", d => d.source.x)
199
- .attr("y1", d => d.source.y)
200
- .attr("x2", d => d.target.x)
201
- .attr("y2", d => d.target.y);
202
-
203
- node
204
- .attr("cx", d => d.x)
205
- .attr("cy", d => d.y);
206
- });
207
-
208
- function dragstarted(event, d) {
209
- if (!event.active) simulation.alphaTarget(0.3).restart();
210
- d.fx = d.x;
211
- d.fy = d.y;
212
- }
213
-
214
- function dragged(event, d) {
215
- d.fx = event.x;
216
- d.fy = event.y;
217
- }
218
-
219
- function dragended(event, d) {
220
- if (!event.active) simulation.alphaTarget(0);
221
- d.fx = null;
222
- d.fy = null;
223
- }
224
- </script>
225
- </body>
226
- </html>`;
227
- }
@@ -1,58 +0,0 @@
1
- export function renderSitegraphCsvNodes(graphData: any): string {
2
- const nodeHeaders = ['URL', 'Depth', 'Status', 'InboundLinks', 'OutboundLinks', 'PageRankScore'];
3
- const nodeRows = graphData.nodes.map((n: any) => {
4
- const outbound = graphData.edges.filter((e: any) => e.source === n.url).length;
5
- const inbound = graphData.edges.filter((e: any) => e.target === n.url).length;
6
- const statusStr = n.status === 0 ? 'Pending/Limit' : n.status;
7
- return [n.url, n.depth, statusStr, inbound, outbound, (n.pageRankScore || 0).toFixed(3)].join(',');
8
- });
9
- return [nodeHeaders.join(','), ...nodeRows].join('\n');
10
- }
11
-
12
- export function renderSitegraphCsvEdges(graphData: any): string {
13
- const edgeHeaders = ['Source', 'Target', 'Weight'];
14
- const edgeRows = graphData.edges.map((e: any) => [e.source, e.target, e.weight].join(','));
15
- return [edgeHeaders.join(','), ...edgeRows].join('\n');
16
- }
17
-
18
- export function renderSitegraphMarkdown(url: string, graphData: any, metrics: any, graph: any): string {
19
- const md = [
20
- `# Crawlith Crawl Summary - ${url}`,
21
- '',
22
- `## 📊 Metrics`,
23
- `- Total Pages Discovered: ${metrics.totalPages}`,
24
- `- Session Pages Crawled: ${graph.sessionStats?.pagesFetched ?? 0}`,
25
- `- Total Edges: ${metrics.totalEdges}`,
26
- `- Avg Depth: ${metrics.averageDepth.toFixed(2)}`,
27
- `- Max Depth: ${metrics.maxDepthFound}`,
28
- `- Crawl Efficiency: ${(metrics.crawlEfficiencyScore * 100).toFixed(1)}%`,
29
- '',
30
- `## 📄 Top Pages (by In-degree)`,
31
- ];
32
-
33
- const topPages = [...graphData.nodes]
34
- .map((n: any) => ({ ...n, inLinks: graphData.edges.filter((e: any) => e.target === n.url).length }))
35
- .sort((a, b) => b.inLinks - a.inLinks)
36
- .slice(0, 10);
37
-
38
- md.push('| URL | Inbound | Status |');
39
- md.push('| :--- | :--- | :--- |');
40
- topPages.forEach(p => {
41
- const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
42
- md.push(`| ${p.url} | ${p.inLinks} | ${statusStr} |`);
43
- });
44
-
45
- if (metrics.topPageRankPages?.length > 0) {
46
- md.push('');
47
- md.push('## 🏆 Top PageRank Pages');
48
- md.push('| URL | Score |');
49
- md.push('| :--- | :--- |');
50
- metrics.topPageRankPages.slice(0, 10).forEach((p: any) => {
51
- const node = graph.nodes?.get ? graph.nodes.get(p.url) : graph.getNodes?.().find((x: any) => x.url === p.url);
52
- const score = node?.pageRankScore ?? 0;
53
- md.push(`| ${p.url} | ${score.toFixed(3)}/100 |`);
54
- });
55
- }
56
-
57
- return md.join('\n');
58
- }
@@ -1,131 +0,0 @@
1
- import { Graph, GraphNode } from '../graph/graph.js';
2
-
3
- export interface HITSOptions {
4
- iterations?: number;
5
- }
6
-
7
- /**
8
- * Computes Hub and Authority scores using the HITS algorithm.
9
- * Operates purely on the internal link graph.
10
- */
11
- export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
12
- const iterations = options.iterations || 20;
13
- const nodes = graph.getNodes();
14
-
15
- // 1. Filter eligible nodes
16
- // Eligibility: status 200, non-redirect (redirectChain empty), not noindex, non-external
17
- const eligibleNodes = nodes.filter(n =>
18
- n.status === 200 &&
19
- (!n.redirectChain || n.redirectChain.length === 0) &&
20
- !n.noindex
21
- );
22
-
23
- if (eligibleNodes.length === 0) return;
24
-
25
- const urlToNode = new Map<string, GraphNode>();
26
- for (const node of eligibleNodes) {
27
- urlToNode.set(node.url, node);
28
- // 2. Initialization
29
- node.authorityScore = 1.0;
30
- node.hubScore = 1.0;
31
- }
32
-
33
- const allEdges = graph.getEdges();
34
- // Filter edges: internal links only (both source and target must be in eligibleNodes), no self-links
35
- const eligibleEdges = allEdges.filter(e =>
36
- e.source !== e.target &&
37
- urlToNode.has(e.source) &&
38
- urlToNode.has(e.target)
39
- );
40
-
41
- // Group edges for efficient iteration
42
- const incoming = new Map<string, { source: string, weight: number }[]>();
43
- const outgoing = new Map<string, { target: string, weight: number }[]>();
44
-
45
- for (const edge of eligibleEdges) {
46
- if (!incoming.has(edge.target)) incoming.set(edge.target, []);
47
- incoming.get(edge.target)!.push({ source: edge.source, weight: edge.weight });
48
-
49
- if (!outgoing.has(edge.source)) outgoing.set(edge.source, []);
50
- outgoing.get(edge.source)!.push({ target: edge.target, weight: edge.weight });
51
- }
52
-
53
- // 3. Iteration
54
- for (let i = 0; i < iterations; i++) {
55
- // Update Authorities
56
- let normAuth = 0;
57
- for (const node of eligibleNodes) {
58
- const inLinks = incoming.get(node.url) || [];
59
- let newAuth = 0;
60
- for (const link of inLinks) {
61
- const sourceNode = urlToNode.get(link.source)!;
62
- newAuth += (sourceNode.hubScore || 0) * link.weight;
63
- }
64
- node.authorityScore = newAuth;
65
- normAuth += newAuth * newAuth;
66
- }
67
-
68
- // Normalize Authorities (L2 norm)
69
- normAuth = Math.sqrt(normAuth);
70
- if (normAuth > 0) {
71
- for (const node of eligibleNodes) {
72
- node.authorityScore = (node.authorityScore || 0) / normAuth;
73
- }
74
- }
75
-
76
- // Update Hubs
77
- let normHub = 0;
78
- for (const node of eligibleNodes) {
79
- const outLinks = outgoing.get(node.url) || [];
80
- let newHub = 0;
81
- for (const link of outLinks) {
82
- const targetNode = urlToNode.get(link.target)!;
83
- newHub += (targetNode.authorityScore || 0) * link.weight;
84
- }
85
- node.hubScore = newHub;
86
- normHub += newHub * newHub;
87
- }
88
-
89
- // Normalize Hubs (L2 norm)
90
- normHub = Math.sqrt(normHub);
91
- if (normHub > 0) {
92
- for (const node of eligibleNodes) {
93
- node.hubScore = (node.hubScore || 0) / normHub;
94
- }
95
- }
96
- }
97
-
98
- // 4. Classification Logic
99
- classifyLinkRoles(eligibleNodes);
100
- }
101
-
102
- function classifyLinkRoles(nodes: GraphNode[]): void {
103
- if (nodes.length === 0) return;
104
-
105
- const authScores = nodes.map(n => n.authorityScore || 0).sort((a, b) => a - b);
106
- const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
107
-
108
- // Use 75th percentile as "high" threshold
109
- const medianAuth = authScores[Math.floor(authScores.length / 2)];
110
- const medianHub = hubScores[Math.floor(hubScores.length / 2)];
111
-
112
- for (const node of nodes) {
113
- const auth = node.authorityScore || 0;
114
- const hub = node.hubScore || 0;
115
-
116
- const isHighAuth = auth > medianAuth && auth > 0.0001;
117
- const isHighHub = hub > medianHub && hub > 0.0001;
118
-
119
- if (isHighAuth && isHighHub) {
120
- node.linkRole = 'power';
121
- } else if (isHighAuth) {
122
- node.linkRole = 'authority';
123
- } else if (isHighHub) {
124
- node.linkRole = 'hub';
125
- } else if (auth > 0.0001 && hub > 0.0001) {
126
- node.linkRole = 'balanced';
127
- } else {
128
- node.linkRole = 'peripheral';
129
- }
130
- }
131
- }
@@ -1,176 +0,0 @@
1
- export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
2
- export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
3
-
4
- export interface SitegraphNode {
5
- url: string;
6
- depth: number;
7
- inLinks: number;
8
- outLinks: number;
9
- status: number;
10
- discoveredViaSitemap?: boolean;
11
- robotsExcluded?: boolean;
12
- canonicalUrl?: string;
13
- isHomepage?: boolean;
14
- wordCount?: number;
15
- hasStructuredData?: boolean;
16
- pageType?: string;
17
- noindex?: boolean;
18
- duplicateContent?: boolean;
19
- isProductOrCommercial?: boolean;
20
- }
21
-
22
- export interface SitegraphEdge {
23
- source: string;
24
- target: string;
25
- }
26
-
27
- export interface OrphanScoringOptions {
28
- enabled: boolean;
29
- severityEnabled: boolean;
30
- includeSoftOrphans: boolean;
31
- minInbound: number;
32
- rootUrl?: string;
33
- }
34
-
35
- export type AnnotatedNode = SitegraphNode & {
36
- orphan: boolean;
37
- orphanType?: OrphanType;
38
- orphanSeverity?: number;
39
- impactLevel?: ImpactLevel;
40
- };
41
-
42
- const LOW_VALUE_PATTERNS = [
43
- /[?&](page|p)=\d+/i,
44
- /\/(page|tag|tags|category|categories)\//i,
45
- /[?&](q|query|search|filter|sort)=/i,
46
- /\/search(\/|\?|$)/i
47
- ];
48
-
49
- function isLowValuePage(node: SitegraphNode): boolean {
50
- const type = (node.pageType || '').toLowerCase();
51
- if (['pagination', 'tag', 'category', 'filter', 'search', 'archive'].includes(type)) {
52
- return true;
53
- }
54
- if (node.noindex) {
55
- return true;
56
- }
57
- return LOW_VALUE_PATTERNS.some((pattern) => pattern.test(node.url));
58
- }
59
-
60
- function clampScore(score: number): number {
61
- return Math.max(0, Math.min(100, Math.round(score)));
62
- }
63
-
64
- export function mapImpactLevel(score: number): ImpactLevel {
65
- if (score <= 39) return 'low';
66
- if (score <= 69) return 'medium';
67
- if (score <= 89) return 'high';
68
- return 'critical';
69
- }
70
-
71
- export function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number {
72
- let score = 0;
73
-
74
- switch (orphanType) {
75
- case 'hard':
76
- score = 90;
77
- break;
78
- case 'crawl-only':
79
- score = 80;
80
- break;
81
- case 'near':
82
- score = node.inLinks <= 1 ? 70 : 60;
83
- break;
84
- case 'soft':
85
- score = 50;
86
- break;
87
- }
88
-
89
- let positiveModifier = 0;
90
- if ((node.wordCount || 0) > 800) positiveModifier += 10;
91
- if (node.hasStructuredData) positiveModifier += 10;
92
- if (node.depth <= 2) positiveModifier += 10;
93
- if (node.isProductOrCommercial) positiveModifier += 10;
94
- positiveModifier = Math.min(20, positiveModifier);
95
-
96
- let negativeModifier = 0;
97
- if ((node.wordCount || 0) > 0 && (node.wordCount || 0) < 300) negativeModifier += 20;
98
- if (node.noindex) negativeModifier += 20;
99
- if (node.duplicateContent) negativeModifier += 20;
100
- if ((node.pageType || '').toLowerCase() === 'archive' || (node.pageType || '').toLowerCase() === 'pagination') negativeModifier += 20;
101
- negativeModifier = Math.min(20, negativeModifier);
102
-
103
- score += positiveModifier;
104
- score -= negativeModifier;
105
-
106
- return clampScore(score);
107
- }
108
-
109
- function consolidateInboundByCanonical(nodes: SitegraphNode[]): Map<string, number> {
110
- const canonicalInbound = new Map<string, number>();
111
- for (const node of nodes) {
112
- const canonical = node.canonicalUrl || node.url;
113
- canonicalInbound.set(canonical, (canonicalInbound.get(canonical) || 0) + node.inLinks);
114
- }
115
- return canonicalInbound;
116
- }
117
-
118
- export function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
119
- if (!options.enabled) {
120
- return nodes.map((node) => ({ ...node, orphan: false }));
121
- }
122
-
123
- const canonicalInbound = consolidateInboundByCanonical(nodes);
124
- const nodeByUrl = new Map(nodes.map((node) => [node.url, node]));
125
-
126
- return nodes.map((node) => {
127
- const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
128
- if (isHomepage || node.robotsExcluded) {
129
- return { ...node, orphan: false };
130
- }
131
-
132
- const canonical = node.canonicalUrl || node.url;
133
- const inbound = canonicalInbound.get(canonical) || 0;
134
-
135
- let orphanType: OrphanType | undefined;
136
-
137
- if (inbound === 0) {
138
- orphanType = node.discoveredViaSitemap ? 'crawl-only' : 'hard';
139
- } else if (inbound <= options.minInbound) {
140
- orphanType = 'near';
141
- }
142
-
143
- if (!orphanType && options.includeSoftOrphans && inbound > 0) {
144
- const inboundSources = edges
145
- .filter((edge) => edge.target === node.url)
146
- .map((edge) => nodeByUrl.get(edge.source))
147
- .filter((source): source is SitegraphNode => Boolean(source));
148
-
149
- if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
150
- orphanType = 'soft';
151
- }
152
- }
153
-
154
- if (!orphanType) {
155
- return { ...node, orphan: false };
156
- }
157
-
158
- if (!options.severityEnabled) {
159
- return {
160
- ...node,
161
- orphan: true,
162
- orphanType
163
- };
164
- }
165
-
166
- const orphanSeverity = calculateOrphanSeverity(orphanType, { ...node, inLinks: inbound });
167
-
168
- return {
169
- ...node,
170
- orphan: true,
171
- orphanType,
172
- orphanSeverity,
173
- impactLevel: mapImpactLevel(orphanSeverity)
174
- };
175
- });
176
- }
@@ -1,18 +0,0 @@
1
- import { readFileSync } from 'node:fs';
2
- import { fileURLToPath } from 'node:url';
3
- import { dirname, join } from 'node:path';
4
-
5
- const __filename = fileURLToPath(import.meta.url);
6
- const __dirname = dirname(__filename);
7
-
8
- let version = '0.0.1';
9
-
10
- try {
11
- const pkgPath = join(__dirname, '../../package.json');
12
- const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
13
- version = pkg.version;
14
- } catch {
15
- // Fallback to internal default
16
- }
17
-
18
- export { version };
@@ -1,49 +0,0 @@
1
- // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
-
3
- exports[`orphan detection and severity scoring > canonical consolidation, robots exclusion, and deterministic JSON output snapshot 1`] = `
4
- "[
5
- {
6
- "url": "https://example.com/canonical",
7
- "depth": 1,
8
- "inLinks": 0,
9
- "outLinks": 0,
10
- "status": 200,
11
- "orphan": true,
12
- "orphanType": "near",
13
- "orphanSeverity": 80,
14
- "impactLevel": "high"
15
- },
16
- {
17
- "url": "https://example.com/variant?a=1",
18
- "depth": 1,
19
- "inLinks": 1,
20
- "outLinks": 0,
21
- "status": 200,
22
- "canonicalUrl": "https://example.com/canonical",
23
- "orphan": true,
24
- "orphanType": "near",
25
- "orphanSeverity": 80,
26
- "impactLevel": "high"
27
- },
28
- {
29
- "url": "https://example.com/blocked",
30
- "depth": 1,
31
- "inLinks": 0,
32
- "outLinks": 0,
33
- "status": 200,
34
- "robotsExcluded": true,
35
- "orphan": false
36
- },
37
- {
38
- "url": "https://example.com/redirect-target",
39
- "depth": 1,
40
- "inLinks": 1,
41
- "outLinks": 0,
42
- "status": 200,
43
- "orphan": true,
44
- "orphanType": "near",
45
- "orphanSeverity": 80,
46
- "impactLevel": "high"
47
- }
48
- ]"
49
- `;