@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,223 +1,22 @@
1
+ import { Crawl_HTML } from './crawl_template.js';
1
2
  function safeJson(data) {
2
3
  return JSON.stringify(data).replace(/</g, '\\u003c');
3
4
  }
4
5
  export function generateHtml(graphData, metrics) {
5
- const graphJson = safeJson(graphData);
6
- return `<!DOCTYPE html>
7
- <html lang="en">
8
- <head>
9
- <meta charset="UTF-8">
10
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
11
- <title>Crawlith Site Graph</title>
12
- <style>
13
- body { margin: 0; overflow: hidden; font-family: sans-serif; }
14
- #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
15
- .tooltip {
16
- position: absolute;
17
- background: white;
18
- border: 1px solid #ccc;
19
- padding: 10px;
20
- pointer-events: none;
21
- font-size: 12px;
22
- box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
23
- display: none;
24
- }
25
- #metrics {
26
- position: absolute;
27
- top: 10px;
28
- left: 10px;
29
- background: rgba(255, 255, 255, 0.9);
30
- padding: 15px;
31
- border-radius: 5px;
32
- box-shadow: 0 0 10px rgba(0,0,0,0.1);
33
- max-width: 320px;
34
- max-height: 90vh;
35
- overflow-y: auto;
36
- z-index: 100;
37
- }
38
- h1 { font-size: 18px; margin-top: 0; }
39
- h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
40
- ul { padding-left: 20px; margin: 5px 0; }
41
- .legend { margin-top: 10px; font-size: 11px; }
42
- .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
43
- .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
44
- .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
45
- .stat-label { color: #666; }
46
- .stat-value { font-weight: bold; }
47
- </style>
48
- </head>
49
- <body>
50
- <div id="metrics">
51
- <h1>Crawlith Site Graph</h1>
52
-
53
- <div class="stat-row">
54
- <span class="stat-label">Discovered Pages:</span>
55
- <span class="stat-value">${metrics.totalPages}</span>
56
- </div>
57
- ${metrics.sessionStats ? `
58
- <div class="stat-row">
59
- <span class="stat-label">Session Crawl:</span>
60
- <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
61
- </div>
62
- ${metrics.sessionStats.pagesCached > 0 ? `
63
- <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
64
- <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
65
- <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
66
- </div>` : ''}
67
- ` : ''}
68
- <div class="stat-row">
69
- <span class="stat-label">Total Edges:</span>
70
- <span class="stat-value">${metrics.totalEdges}</span>
71
- </div>
72
- <div class="stat-row">
73
- <span class="stat-label">Max Depth:</span>
74
- <span class="stat-value">${metrics.maxDepthFound}</span>
75
- </div>
76
- <div class="stat-row">
77
- <span class="stat-label">Avg Out-Degree:</span>
78
- <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
79
- </div>
80
-
81
- <div class="legend">
82
- <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
83
- <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
84
- <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
85
- </div>
86
-
87
- ${metrics.topAuthorityPages.length > 0 ? `
88
- <h3>Top Authority</h3>
89
- <ul>
90
- ${metrics.topAuthorityPages.map(p => `<li><a href="${p.url}" target="_blank">${new URL(p.url).pathname}</a> (${p.authority.toFixed(2)})</li>`).join('')}
91
- </ul>
92
- ` : ''}
93
-
94
- ${metrics.orphanPages.length > 0 ? `
95
- <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
96
- <details>
97
- <summary>Show list</summary>
98
- <ul>
99
- ${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${url}" target="_blank">${url}</a></li>`).join('')}
100
- ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
101
- </ul>
102
- </details>
103
- ` : ''}
104
- </div>
105
- <div id="graph"></div>
106
- <div class="tooltip" id="tooltip"></div>
107
-
108
- <script src="https://d3js.org/d3.v7.min.js"></script>
109
- <script>
110
- // Make data available globally
6
+ // Strip heavy HTML content from nodes to keep the report lightweight
7
+ const vizGraphData = {
8
+ ...graphData,
9
+ nodes: graphData.nodes ? graphData.nodes.map((n) => {
10
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
11
+ const { html, ...rest } = n;
12
+ return rest;
13
+ }) : []
14
+ };
15
+ const graphJson = safeJson(vizGraphData);
16
+ const metricsJson = safeJson(metrics);
17
+ return Crawl_HTML.replace('</body>', `<script>
111
18
  window.GRAPH_DATA = ${graphJson};
112
-
113
- const data = window.GRAPH_DATA;
114
- const width = window.innerWidth;
115
- const height = window.innerHeight;
116
-
117
- const svg = d3.select("#graph").append("svg")
118
- .attr("width", width)
119
- .attr("height", height)
120
- .call(d3.zoom().on("zoom", (event) => {
121
- g.attr("transform", event.transform);
122
- }));
123
-
124
- const g = svg.append("g");
125
-
126
- // Define arrow marker
127
- svg.append("defs").selectAll("marker")
128
- .data(["arrow"])
129
- .enter().append("marker")
130
- .attr("id", d => d)
131
- .attr("viewBox", "0 -5 10 10")
132
- .attr("refX", 15)
133
- .attr("refY", 0)
134
- .attr("markerWidth", 6)
135
- .attr("markerHeight", 6)
136
- .attr("orient", "auto")
137
- .append("path")
138
- .attr("d", "M0,-5L10,0L0,5")
139
- .attr("fill", "#999");
140
-
141
- const simulation = d3.forceSimulation(data.nodes)
142
- .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
143
- .force("charge", d3.forceManyBody().strength(-300))
144
- .force("center", d3.forceCenter(width / 2, height / 2))
145
- .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));
146
-
147
- const link = g.append("g")
148
- .attr("stroke", "#999")
149
- .attr("stroke-opacity", 0.6)
150
- .selectAll("line")
151
- .data(data.edges)
152
- .join("line")
153
- .attr("stroke-width", 1)
154
- .attr("marker-end", "url(#arrow)");
155
-
156
-
157
- const node = g.append("g")
158
- .attr("stroke", "#fff")
159
- .attr("stroke-width", 1.5)
160
- .selectAll("circle")
161
- .data(data.nodes)
162
- .join("circle")
163
- .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
164
- .attr("fill", d => {
165
- if (d.inLinks === 0 && d.depth > 0) return "red";
166
- if (d.depth >= 4) return "orange";
167
- return "blue";
168
- })
169
- .call(d3.drag()
170
- .on("start", dragstarted)
171
- .on("drag", dragged)
172
- .on("end", dragended));
173
-
174
- const tooltip = d3.select("#tooltip");
175
-
176
- node.on("mouseover", (event, d) => {
177
- tooltip.style("display", "block")
178
- .html(\`
179
- <strong>URL:</strong> \${d.url}<br>
180
- <strong>Depth:</strong> \${d.depth}<br>
181
- <strong>In-Links:</strong> \${d.inLinks}<br>
182
- <strong>Out-Links:</strong> \${d.outLinks}<br>
183
- <strong>Status:</strong> \${d.status}
184
- \`)
185
- .style("left", (event.pageX + 10) + "px")
186
- .style("top", (event.pageY - 10) + "px");
187
- })
188
- .on("mouseout", () => {
189
- tooltip.style("display", "none");
190
- });
191
-
192
- simulation.on("tick", () => {
193
- link
194
- .attr("x1", d => d.source.x)
195
- .attr("y1", d => d.source.y)
196
- .attr("x2", d => d.target.x)
197
- .attr("y2", d => d.target.y);
198
-
199
- node
200
- .attr("cx", d => d.x)
201
- .attr("cy", d => d.y);
202
- });
203
-
204
- function dragstarted(event, d) {
205
- if (!event.active) simulation.alphaTarget(0.3).restart();
206
- d.fx = d.x;
207
- d.fy = d.y;
208
- }
209
-
210
- function dragged(event, d) {
211
- d.fx = event.x;
212
- d.fy = event.y;
213
- }
214
-
215
- function dragended(event, d) {
216
- if (!event.active) simulation.alphaTarget(0);
217
- d.fx = null;
218
- d.fy = null;
219
- }
19
+ window.METRICS_DATA = ${metricsJson};
220
20
  </script>
221
- </body>
222
- </html>`;
21
+ </body>`);
223
22
  }
@@ -0,0 +1,27 @@
1
+ import { Graph, Metrics } from '@crawlith/core';
2
+ export interface CrawlInsightReport {
3
+ pages: number;
4
+ fetchedPages?: number;
5
+ summary: {
6
+ crawlDepth: number;
7
+ internalLinks: number;
8
+ externalLinks: number;
9
+ };
10
+ health?: {
11
+ score: number;
12
+ status: string;
13
+ weightedPenalties: any;
14
+ };
15
+ issues?: any;
16
+ topAuthorityPages: {
17
+ url: string;
18
+ score: number;
19
+ }[];
20
+ }
21
+ export declare function buildCrawlInsightReport(graph: Graph, metrics: Metrics, healthData?: {
22
+ health: any;
23
+ issues: any;
24
+ }): CrawlInsightReport;
25
+ export declare function renderInsightOutput(report: CrawlInsightReport, snapshotId: number): string;
26
+ export declare function renderScoreBreakdown(health: any): string;
27
+ export declare function hasCriticalIssues(report: CrawlInsightReport): boolean;
@@ -0,0 +1,103 @@
1
+ export function buildCrawlInsightReport(graph, metrics, healthData) {
2
+ return {
3
+ pages: metrics.totalPages,
4
+ fetchedPages: metrics.sessionStats?.pagesFetched,
5
+ health: healthData?.health,
6
+ issues: healthData?.issues,
7
+ summary: {
8
+ crawlDepth: metrics.maxDepthFound,
9
+ internalLinks: metrics.totalEdges,
10
+ externalLinks: healthData?.issues?.externalLinks || 0
11
+ },
12
+ topAuthorityPages: metrics.topAuthorityPages.map(p => ({ url: p.url, score: p.authority }))
13
+ };
14
+ }
15
+ export function renderInsightOutput(report, snapshotId) {
16
+ const lines = [];
17
+ // Header
18
+ lines.push(`CRAWLITH — Crawl`);
19
+ lines.push('');
20
+ lines.push(`# ${snapshotId}`);
21
+ lines.push('');
22
+ if (report.fetchedPages !== undefined) {
23
+ if (report.fetchedPages === report.pages) {
24
+ lines.push(`${report.pages} pages crawled`);
25
+ }
26
+ else {
27
+ lines.push(`${report.fetchedPages} pages fetched / ${report.pages} discovered`);
28
+ }
29
+ }
30
+ else {
31
+ lines.push(`${report.pages} pages crawled`);
32
+ }
33
+ lines.push('');
34
+ // Health Score if available
35
+ if (report.health) {
36
+ lines.push(`Score: ${report.health.score} (${report.health.status})`);
37
+ lines.push('');
38
+ }
39
+ // ===== Critical =====
40
+ if (report.issues) {
41
+ const critical = [];
42
+ const addLine = (arr, condition, text) => condition && arr.push(text);
43
+ addLine(critical, report.issues.orphanPages > 0, `${report.issues.orphanPages} orphan pages`);
44
+ addLine(critical, report.issues.redirectChains > 0, `${report.issues.redirectChains} redirect chains`);
45
+ addLine(critical, report.issues.brokenInternalLinks > 0, `${report.issues.brokenInternalLinks} broken internal links`);
46
+ addLine(critical, report.issues.duplicateClusters > 0, `${report.issues.duplicateClusters} near-duplicate clusters`);
47
+ addLine(critical, report.issues.canonicalConflicts > 0, `${report.issues.canonicalConflicts} canonical conflicts`);
48
+ addLine(critical, report.issues.accidentalNoindex > 0, `${report.issues.accidentalNoindex} pages accidentally noindexed`);
49
+ addLine(critical, report.issues.blockedByRobots > 0, `${report.issues.blockedByRobots} pages blocked by robots.txt`);
50
+ if (critical.length > 0) {
51
+ lines.push(`Critical`);
52
+ for (const c of critical)
53
+ lines.push(` • ${c}`);
54
+ lines.push('');
55
+ }
56
+ // ===== Warnings =====
57
+ const warnings = [];
58
+ addLine(warnings, report.issues.missingH1 > 0, `${report.issues.missingH1} pages missing H1`);
59
+ addLine(warnings, report.issues.thinContent > 0, `${report.issues.thinContent} thin content pages`);
60
+ addLine(warnings, report.issues.excessiveInternalLinkCount > 0, `${report.issues.excessiveInternalLinkCount} pages with excessive links`);
61
+ addLine(warnings, report.issues.imageAltMissing > 0, `${report.issues.imageAltMissing} pages missing image alt`);
62
+ if (warnings.length > 0) {
63
+ lines.push(`Warnings`);
64
+ for (const w of warnings)
65
+ lines.push(` • ${w}`);
66
+ lines.push('');
67
+ }
68
+ }
69
+ // ===== Structure =====
70
+ lines.push(`Structure`);
71
+ lines.push(` Depth Reached ${report.summary.crawlDepth}`);
72
+ lines.push(` Internal Links ${report.summary.internalLinks}`);
73
+ lines.push(` External Links ${report.summary.externalLinks}`);
74
+ lines.push('');
75
+ // ===== Authority =====
76
+ if (report.topAuthorityPages.length > 0) {
77
+ lines.push(`Top Authority`);
78
+ for (const page of report.topAuthorityPages.slice(0, 10)) {
79
+ lines.push(` ${page.url} ${page.score.toFixed(3)}`);
80
+ }
81
+ lines.push('');
82
+ }
83
+ return `${lines.join('\n')}\n`;
84
+ }
85
+ export function renderScoreBreakdown(health) {
86
+ return [
87
+ 'Health Score Breakdown',
88
+ `weights: ${JSON.stringify(health.weights)}`,
89
+ `penalties: ${JSON.stringify(health.weightedPenalties)}`
90
+ ].join('\n');
91
+ }
92
+ export function hasCriticalIssues(report) {
93
+ if (!report.issues)
94
+ return false;
95
+ const { issues } = report;
96
+ return (issues.orphanPages > 0 ||
97
+ issues.brokenInternalLinks > 0 ||
98
+ issues.redirectChains > 0 ||
99
+ issues.duplicateClusters > 0 ||
100
+ issues.canonicalConflicts > 0 ||
101
+ issues.accidentalNoindex > 0 ||
102
+ issues.blockedByRobots > 0);
103
+ }
@@ -0,0 +1,56 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ export interface HealthScoreWeights {
3
+ orphans: number;
4
+ brokenLinks: number;
5
+ redirectChains: number;
6
+ duplicateClusters: number;
7
+ thinContent: number;
8
+ missingH1: number;
9
+ noindexMisuse: number;
10
+ canonicalConflicts: number;
11
+ lowInternalLinks: number;
12
+ excessiveLinks: number;
13
+ blockedByRobots: number;
14
+ crawlTraps: number;
15
+ }
16
+ export interface CrawlIssueCounts {
17
+ orphanPages: number;
18
+ brokenInternalLinks: number;
19
+ redirectChains: number;
20
+ duplicateClusters: number;
21
+ canonicalConflicts: number;
22
+ accidentalNoindex: number;
23
+ missingH1: number;
24
+ thinContent: number;
25
+ lowInternalLinkCount: number;
26
+ excessiveInternalLinkCount: number;
27
+ highExternalLinkRatio: number;
28
+ imageAltMissing: number;
29
+ strongPagesUnderLinking: number;
30
+ cannibalizationClusters: number;
31
+ nearAuthorityThreshold: number;
32
+ underlinkedHighAuthorityPages: number;
33
+ externalLinks: number;
34
+ blockedByRobots: number;
35
+ crawlTraps: number;
36
+ }
37
+ export interface HealthScoreBreakdown {
38
+ score: number;
39
+ status: string;
40
+ weightedPenalties: Record<string, number>;
41
+ weights: HealthScoreWeights;
42
+ }
43
+ export declare const THIN_CONTENT_THRESHOLD = 200;
44
+ export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
45
+ export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
46
+ export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
47
+ export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
48
+ export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
49
+ export declare class HealthService {
50
+ calculateHealthScore(totalPages: number, issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots' | 'crawlTraps'>, weights?: HealthScoreWeights): HealthScoreBreakdown;
51
+ collectCrawlIssues(graph: Graph, metrics: any, rootOrigin?: string): CrawlIssueCounts;
52
+ private clamp;
53
+ private healthStatusLabel;
54
+ }
55
+ export declare const calculateHealthScore: (totalPages: number, issues: Pick<CrawlIssueCounts, "orphanPages" | "brokenInternalLinks" | "redirectChains" | "duplicateClusters" | "thinContent" | "missingH1" | "accidentalNoindex" | "canonicalConflicts" | "lowInternalLinkCount" | "excessiveInternalLinkCount" | "blockedByRobots" | "crawlTraps">, weights?: HealthScoreWeights) => HealthScoreBreakdown;
56
+ export declare const healthStatusLabel: (score: number, hasCritical?: boolean) => "Needs Attention" | "Excellent" | "Good" | "Critical";
@@ -0,0 +1,213 @@
1
+ import { analyzeContent } from '../analysis/content.js';
2
+ import { analyzeH1 } from '../analysis/seo.js';
3
+ import { analyzeImageAlts } from '../analysis/images.js';
4
+ import { analyzeLinks } from '../analysis/links.js';
5
+ import { UrlUtil } from '../crawler/normalize.js';
6
// --- Per-page thresholds used while collecting crawl issues ---

/** Pages with fewer words than this are counted as thin content. */
export const THIN_CONTENT_THRESHOLD = 200;

/** High-authority pages with fewer inbound links than this are counted as under-linked. */
export const LOW_INTERNAL_LINK_THRESHOLD = 2;

/** Pages with more outbound links than this are counted as having excessive links. */
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;

/** Pages whose external-link ratio exceeds this are counted as externally heavy. */
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;

/** Authority value at or above which a page is treated as a linking opportunity. */
export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;

/**
 * Default weight of each issue category in the health score. A category's
 * penalty is capped at its weight, so the weight is also the maximum number
 * of points that category can remove.
 */
export const DEFAULT_HEALTH_WEIGHTS = {
    // Structural and crawlability problems
    orphans: 50,
    brokenLinks: 100,
    redirectChains: 20,
    // Content problems
    duplicateClusters: 25,
    thinContent: 15,
    missingH1: 10,
    // Indexing signals
    noindexMisuse: 20,
    canonicalConflicts: 10,
    // Internal linking
    lowInternalLinks: 10,
    excessiveLinks: 5,
    // Crawl blockers
    blockedByRobots: 100,
    crawlTraps: 50
};
25
// [weight key, issue-count key] pairs. The order here is the key order of the
// `weightedPenalties` object returned by calculateHealthScore.
const HEALTH_PENALTY_SOURCES = [
    ['orphans', 'orphanPages'],
    ['brokenLinks', 'brokenInternalLinks'],
    ['redirectChains', 'redirectChains'],
    ['duplicateClusters', 'duplicateClusters'],
    ['thinContent', 'thinContent'],
    ['missingH1', 'missingH1'],
    ['noindexMisuse', 'accidentalNoindex'],
    ['canonicalConflicts', 'canonicalConflicts'],
    ['lowInternalLinks', 'lowInternalLinkCount'],
    ['excessiveLinks', 'excessiveInternalLinkCount'],
    ['blockedByRobots', 'blockedByRobots'],
    ['crawlTraps', 'crawlTraps']
];
// Issue counts that cap the status label at "Needs Attention" when non-zero.
const HEALTH_CRITICAL_ISSUE_KEYS = [
    'orphanPages',
    'brokenInternalLinks',
    'redirectChains',
    'duplicateClusters',
    'canonicalConflicts',
    'accidentalNoindex',
    'blockedByRobots'
];
const HEALTH_TRAILING_SLASH = /\/$/;
// True when a node is a confirmed failure: an HTTP status >= 400, or status 0
// together with a recorded network/security/fetch error.
function isConfirmedNodeFailure(node) {
    if (node.status >= 400) {
        return true;
    }
    if (node.status !== 0) {
        return false;
    }
    return Boolean(node.crawlStatus === 'network_error' ||
        node.crawlStatus === 'failed_after_retries' ||
        node.securityError ||
        node.crawlStatus === 'fetched_error');
}
// Resolves a node URL against rootOrigin, falling back to the raw URL when no
// origin is given, the URL is already absolute, or resolution fails.
function resolveNodeUrl(url, rootOrigin) {
    if (!rootOrigin || url.startsWith('http')) {
        return url;
    }
    try {
        return new URL(url, rootOrigin).toString();
    }
    catch {
        // One malformed URL must not abort the whole issue collection.
        return url;
    }
}
/**
 * Collects issue counts from a crawl graph and converts them into a health score.
 */
export class HealthService {
    /**
     * Converts issue counts into a 0-100 score.
     *
     * Each category's penalty is (count / pages) * weight, clamped to
     * [0, weight]. The score is 100 minus the sum of penalties, clamped to
     * [0, 100] and rounded to one decimal place.
     *
     * @param totalPages Number of pages; values below 1 are treated as 1.
     * @param issues Issue counts; missing counts are treated as 0.
     * @param weights Per-category weights (defaults to DEFAULT_HEALTH_WEIGHTS).
     * @returns `{ score, status, weightedPenalties, weights }`.
     */
    calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
        // Avoid dividing by zero for an empty crawl.
        const safePages = Math.max(totalPages, 1);
        const weightedPenalties = Object.fromEntries(HEALTH_PENALTY_SOURCES.map(([weightKey, issueKey]) => {
            const weight = weights[weightKey];
            const ratio = (issues[issueKey] || 0) / safePages;
            return [weightKey, this.clamp(ratio * weight, 0, weight)];
        }));
        const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
        const score = Number(this.clamp(100 - totalPenalty, 0, 100).toFixed(1));
        const hasCritical = HEALTH_CRITICAL_ISSUE_KEYS.some((key) => (issues[key] || 0) > 0);
        return {
            score,
            status: this.healthStatusLabel(score, hasCritical),
            weightedPenalties,
            weights
        };
    }
    /**
     * Walks the graph once per concern and tallies every issue category.
     *
     * Only nodes flagged `isInternal` are inspected. Nodes without `html`
     * contribute to the status/link counters but not to the content counters
     * (H1, thin content, external ratio, image alts).
     *
     * @param graph Crawl graph; must provide `getNodes()` and a `nodes` map keyed by URL.
     * @param metrics Pre-computed metrics; `orphanPages` and `clusters` are read when present.
     *                May be null/undefined, in which case both are treated as empty.
     * @param rootOrigin Origin used to resolve relative node URLs; '' disables resolution.
     * @returns The full set of issue counters.
     */
    collectCrawlIssues(graph, metrics, rootOrigin = '') {
        const nodes = graph.getNodes();
        let brokenInternalLinks = 0;
        let redirectChains = 0;
        let canonicalConflicts = 0;
        let accidentalNoindex = 0;
        let missingH1 = 0;
        let thinContent = 0;
        let highExternalLinkRatio = 0;
        let imageAltMissing = 0;
        let lowInternalLinkCount = 0;
        let excessiveInternalLinkCount = 0;
        let strongPagesUnderLinking = 0;
        let nearAuthorityThreshold = 0;
        let underlinkedHighAuthorityPages = 0;
        let externalLinks = 0;
        let blockedByRobots = 0;
        let crawlTraps = 0;
        for (const node of nodes) {
            if (!node.isInternal) {
                continue;
            }
            if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
                blockedByRobots += 1;
            }
            if (node.crawlTrapFlag) {
                crawlTraps += 1;
            }
            // A failing page counts once for itself...
            if (isConfirmedNodeFailure(node)) {
                brokenInternalLinks += 1;
            }
            // ...and once more for every link on another page that points at a
            // confirmed-failing target known to the graph.
            if (node.brokenLinks) {
                brokenInternalLinks += node.brokenLinks.filter((url) => {
                    const target = graph.nodes.get(url);
                    return target && isConfirmedNodeFailure(target);
                }).length;
            }
            if ((node.redirectChain?.length || 0) > 1) {
                redirectChains += 1;
            }
            const absoluteUrl = resolveNodeUrl(node.url, rootOrigin);
            if (node.canonical && node.canonical !== node.url && node.canonical !== absoluteUrl) {
                // Last chance: a canonical that differs only by a trailing slash is not a conflict.
                const normCanonical = node.canonical.replace(HEALTH_TRAILING_SLASH, '');
                const normAbsolute = absoluteUrl.replace(HEALTH_TRAILING_SLASH, '');
                if (normCanonical !== normAbsolute) {
                    canonicalConflicts += 1;
                }
            }
            if (node.noindex && node.status >= 200 && node.status < 300) {
                accidentalNoindex += 1;
            }
            // NOTE(review): exactly one inbound link, not `< LOW_INTERNAL_LINK_THRESHOLD`;
            // presumably so zero-inlink pages are reported only as orphans - confirm.
            if (node.inLinks === 1 && node.depth > 0) {
                lowInternalLinkCount += 1;
            }
            if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
                excessiveInternalLinkCount += 1;
            }
            // Everything below needs the page markup.
            if (!node.html) {
                continue;
            }
            if (analyzeH1(node.html, '').count === 0) {
                missingH1 += 1;
            }
            // Prefer the stored word count; fall back to analysing the markup.
            const wordCount = node.wordCount ?? analyzeContent(node.html).wordCount;
            if (wordCount < THIN_CONTENT_THRESHOLD) {
                thinContent += 1;
            }
            const pageAbsUrl = rootOrigin ? UrlUtil.toAbsolute(node.url, rootOrigin) : node.url;
            const links = analyzeLinks(node.html, pageAbsUrl, rootOrigin || node.url);
            externalLinks += links.externalLinks;
            if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
                highExternalLinkRatio += 1;
            }
            if (analyzeImageAlts(node.html).missingAlt > 0) {
                imageAltMissing += 1;
            }
        }
        const clusters = graph.contentClusters || metrics?.clusters || [];
        const duplicateClusters = clusters.length;
        const cannibalizationClusters = clusters.filter((cluster) => cluster.risk === 'high' || cluster.type === 'near').length;
        for (const node of nodes) {
            // External nodes are not site pages; skip them here just as the
            // first pass does, so heavily-linked third-party URLs are not
            // reported as internal linking opportunities.
            if (!node.isInternal) {
                continue;
            }
            // Authority is a two-level proxy derived from inbound links.
            // NOTE(review): with only 0.8 / 0.2 as possible values, the
            // nearAuthorityThreshold and underlinkedHighAuthorityPages
            // conditions below can never be true; they need a graded
            // authority score to become meaningful.
            const authority = node.inLinks > 5 ? 0.8 : 0.2;
            if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
                strongPagesUnderLinking += 1;
            }
            if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
                nearAuthorityThreshold += 1;
            }
            if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
                underlinkedHighAuthorityPages += 1;
            }
        }
        return {
            orphanPages: metrics?.orphanPages?.length || 0,
            brokenInternalLinks,
            redirectChains,
            duplicateClusters,
            canonicalConflicts,
            accidentalNoindex,
            missingH1,
            thinContent,
            lowInternalLinkCount,
            excessiveInternalLinkCount,
            highExternalLinkRatio,
            imageAltMissing,
            strongPagesUnderLinking,
            cannibalizationClusters,
            nearAuthorityThreshold,
            underlinkedHighAuthorityPages,
            externalLinks,
            blockedByRobots,
            crawlTraps
        };
    }
    /** Restricts `value` to the inclusive range [min, max]. */
    clamp(value, min, max) {
        return Math.min(max, Math.max(min, value));
    }
    /**
     * Maps a score to its status label. Any critical issue caps an otherwise
     * "Excellent" or "Good" score at "Needs Attention".
     */
    healthStatusLabel(score, hasCritical = false) {
        if (hasCritical && score >= 75)
            return 'Needs Attention';
        if (score >= 90)
            return 'Excellent';
        if (score >= 75)
            return 'Good';
        if (score >= 50)
            return 'Needs Attention';
        return 'Critical';
    }
}
201
// Single shared instance backing the function-style API.
const service = new HealthService();

/**
 * Function-style wrapper around `HealthService.calculateHealthScore`, using the
 * shared module-level service instance.
 */
export const calculateHealthScore = (totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) => {
    return service.calculateHealthScore(totalPages, issues, weights);
};
203
/**
 * Maps a health score to its status label.
 *
 * Scores of 75 and above are "Excellent" (>= 90) or "Good", unless a critical
 * issue is present, in which case they are capped at "Needs Attention".
 * Scores from 50 up to 75 are "Needs Attention"; anything else is "Critical".
 */
export const healthStatusLabel = (score, hasCritical = false) => {
    if (score >= 75) {
        if (hasCritical) {
            return 'Needs Attention';
        }
        return score >= 90 ? 'Excellent' : 'Good';
    }
    return score >= 50 ? 'Needs Attention' : 'Critical';
};
@@ -0,0 +1,6 @@
1
/**
 * Minimal chalk-style typing: a value that can be called to produce a string,
 * and whose every string-keyed property is another such value, so accessors
 * can be chained to any depth.
 */
interface Chalk {
    /** Calls the styler on `text` and returns a string. */
    (text: unknown): string;
    /** Any property access yields another chainable styler. */
    [key: string]: Chalk;
}

declare const chalk: Chalk;

export default chalk;