@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,223 @@
1
/**
 * Serializes a value as JSON that is safe to inline inside an HTML
 * `<script>` element.
 *
 * Characters that could terminate the script element or open an HTML
 * comment (`<`, `>`, `&`), plus the line separators U+2028/U+2029 (which
 * pre-ES2019 engines reject inside string literals), are emitted as
 * `\uXXXX` escapes. The escapes are valid JSON/JS, so the parsed value is
 * unchanged.
 *
 * @param {*} data - Any JSON-serializable value.
 * @returns {string} JSON text containing none of `<`, `>`, `&`, U+2028, U+2029.
 */
function safeJson(data) {
  // JSON.stringify yields undefined (not a string) for undefined, functions
  // and symbols; fall back to a literal null so the embedded script stays valid.
  const json = JSON.stringify(data) ?? 'null';
  return json.replace(
    /[<>&\u2028\u2029]/g,
    (ch) => `\\u${ch.charCodeAt(0).toString(16).padStart(4, '0')}`,
  );
}
4
/**
 * Builds a standalone HTML report page with a metrics side panel and a
 * D3 force-directed rendering of the crawl graph.
 *
 * URLs come from crawled pages and are treated as untrusted: they are
 * HTML-escaped before being placed in attributes or text, and the
 * client-side tooltip escapes node fields before calling `.html()`.
 *
 * @param {object} graphData - Graph payload embedded as `window.GRAPH_DATA`;
 *   the page script reads `nodes` (with `url`, `depth`, `inLinks`,
 *   `outLinks`, `status`) and `edges`.
 * @param {object} metrics - Reads `totalPages`, `totalEdges`,
 *   `maxDepthFound`, `averageOutDegree`, `topAuthorityPages`
 *   (`{ url, authority }[]`), `orphanPages` (`string[]`) and the optional
 *   `sessionStats` (`pagesFetched`, `pagesCached`).
 * @returns {string} The complete HTML document.
 */
export function generateHtml(graphData, metrics) {
  // Escape the five HTML-significant characters so a value can be used in
  // both text nodes and double-quoted attributes.
  const escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  // Label for a link: the URL's pathname, or the raw value when it cannot
  // be parsed (a single malformed URL must not abort the whole report).
  const displayPath = (rawUrl) => {
    try {
      return new URL(rawUrl).pathname;
    } catch {
      return String(rawUrl);
    }
  };

  const graphJson = safeJson(graphData);

  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Crawlith Site Graph</title>
  <style>
    body { margin: 0; overflow: hidden; font-family: sans-serif; }
    #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
    .tooltip {
      position: absolute;
      background: white;
      border: 1px solid #ccc;
      padding: 10px;
      pointer-events: none;
      font-size: 12px;
      box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
      display: none;
    }
    #metrics {
      position: absolute;
      top: 10px;
      left: 10px;
      background: rgba(255, 255, 255, 0.9);
      padding: 15px;
      border-radius: 5px;
      box-shadow: 0 0 10px rgba(0,0,0,0.1);
      max-width: 320px;
      max-height: 90vh;
      overflow-y: auto;
      z-index: 100;
    }
    h1 { font-size: 18px; margin-top: 0; }
    h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
    ul { padding-left: 20px; margin: 5px 0; }
    .legend { margin-top: 10px; font-size: 11px; }
    .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
    .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
    .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
    .stat-label { color: #666; }
    .stat-value { font-weight: bold; }
  </style>
</head>
<body>
  <div id="metrics">
    <h1>Crawlith Site Graph</h1>

    <div class="stat-row">
      <span class="stat-label">Discovered Pages:</span>
      <span class="stat-value">${metrics.totalPages}</span>
    </div>
    ${metrics.sessionStats ? `
    <div class="stat-row">
      <span class="stat-label">Session Crawl:</span>
      <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
    </div>
    ${metrics.sessionStats.pagesCached > 0 ? `
    <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
      <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
      <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
    </div>` : ''}
    ` : ''}
    <div class="stat-row">
      <span class="stat-label">Total Edges:</span>
      <span class="stat-value">${metrics.totalEdges}</span>
    </div>
    <div class="stat-row">
      <span class="stat-label">Max Depth:</span>
      <span class="stat-value">${metrics.maxDepthFound}</span>
    </div>
    <div class="stat-row">
      <span class="stat-label">Avg Out-Degree:</span>
      <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
    </div>

    <div class="legend">
      <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
      <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
      <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
    </div>

    ${metrics.topAuthorityPages.length > 0 ? `
    <h3>Top Authority</h3>
    <ul>
      ${metrics.topAuthorityPages.map((p) => `<li><a href="${escapeHtml(p.url)}" target="_blank" rel="noopener noreferrer">${escapeHtml(displayPath(p.url))}</a> (${p.authority.toFixed(2)})</li>`).join('')}
    </ul>
    ` : ''}

    ${metrics.orphanPages.length > 0 ? `
    <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
    <details>
      <summary>Show list</summary>
      <ul>
        ${metrics.orphanPages.slice(0, 20).map((url) => `<li><a href="${escapeHtml(url)}" target="_blank" rel="noopener noreferrer">${escapeHtml(url)}</a></li>`).join('')}
        ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
      </ul>
    </details>
    ` : ''}
  </div>
  <div id="graph"></div>
  <div class="tooltip" id="tooltip"></div>

  <script src="https://d3js.org/d3.v7.min.js"></script>
  <script>
    // Make data available globally
    window.GRAPH_DATA = ${graphJson};

    const data = window.GRAPH_DATA;
    const width = window.innerWidth;
    const height = window.innerHeight;

    // Node fields originate from crawled pages; escape them before they are
    // handed to .html() in the tooltip.
    const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => "&#" + ch.charCodeAt(0) + ";");

    const svg = d3.select("#graph").append("svg")
      .attr("width", width)
      .attr("height", height)
      .call(d3.zoom().on("zoom", (event) => {
        g.attr("transform", event.transform);
      }));

    const g = svg.append("g");

    // Define arrow marker
    svg.append("defs").selectAll("marker")
      .data(["arrow"])
      .enter().append("marker")
      .attr("id", d => d)
      .attr("viewBox", "0 -5 10 10")
      .attr("refX", 15)
      .attr("refY", 0)
      .attr("markerWidth", 6)
      .attr("markerHeight", 6)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-5L10,0L0,5")
      .attr("fill", "#999");

    const simulation = d3.forceSimulation(data.nodes)
      .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
      .force("charge", d3.forceManyBody().strength(-300))
      .force("center", d3.forceCenter(width / 2, height / 2))
      .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));

    const link = g.append("g")
      .attr("stroke", "#999")
      .attr("stroke-opacity", 0.6)
      .selectAll("line")
      .data(data.edges)
      .join("line")
      .attr("stroke-width", 1)
      .attr("marker-end", "url(#arrow)");


    const node = g.append("g")
      .attr("stroke", "#fff")
      .attr("stroke-width", 1.5)
      .selectAll("circle")
      .data(data.nodes)
      .join("circle")
      .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
      .attr("fill", d => {
        if (d.inLinks === 0 && d.depth > 0) return "red";
        if (d.depth >= 4) return "orange";
        return "blue";
      })
      .call(d3.drag()
        .on("start", dragstarted)
        .on("drag", dragged)
        .on("end", dragended));

    const tooltip = d3.select("#tooltip");

    node.on("mouseover", (event, d) => {
      tooltip.style("display", "block")
        .html(\`
          <strong>URL:</strong> \${esc(d.url)}<br>
          <strong>Depth:</strong> \${esc(d.depth)}<br>
          <strong>In-Links:</strong> \${esc(d.inLinks)}<br>
          <strong>Out-Links:</strong> \${esc(d.outLinks)}<br>
          <strong>Status:</strong> \${esc(d.status)}
        \`)
        .style("left", (event.pageX + 10) + "px")
        .style("top", (event.pageY - 10) + "px");
    })
    .on("mouseout", () => {
      tooltip.style("display", "none");
    });

    simulation.on("tick", () => {
      link
        .attr("x1", d => d.source.x)
        .attr("y1", d => d.source.y)
        .attr("x2", d => d.target.x)
        .attr("y2", d => d.target.y);

      node
        .attr("cx", d => d.x)
        .attr("cy", d => d.y);
    });

    function dragstarted(event, d) {
      if (!event.active) simulation.alphaTarget(0.3).restart();
      d.fx = d.x;
      d.fy = d.y;
    }

    function dragged(event, d) {
      d.fx = event.x;
      d.fy = event.y;
    }

    function dragended(event, d) {
      if (!event.active) simulation.alphaTarget(0);
      d.fx = null;
      d.fy = null;
    }
  </script>
</body>
</html>`;
}
@@ -0,0 +1,3 @@
1
/**
 * Renders the nodes of `graphData` as CSV text: a header row
 * (URL, Depth, Status, InboundLinks, OutboundLinks, PageRankScore)
 * followed by one row per entry of `graphData.nodes`, joined with `\n`.
 */
+ export declare function renderSitegraphCsvNodes(graphData: any): string;
2
/**
 * Renders the edges of `graphData` as CSV text: a header row
 * (Source, Target, Weight) followed by one row per entry of
 * `graphData.edges`, joined with `\n`.
 */
+ export declare function renderSitegraphCsvEdges(graphData: any): string;
3
/**
 * Renders a Markdown crawl summary for `url`: a metrics list, a table of
 * the ten nodes with the most inbound edges, and, when
 * `metrics.topPageRankPages` is non-empty, a table of up to ten PageRank
 * entries whose scores are looked up on `graph`.
 */
+ export declare function renderSitegraphMarkdown(url: string, graphData: any, metrics: any, graph: any): string;
@@ -0,0 +1,52 @@
1
/**
 * Renders the graph's nodes as CSV text.
 *
 * Columns: URL, Depth, Status, InboundLinks, OutboundLinks, PageRankScore.
 * Inbound/outbound counts are derived from `graphData.edges`; a status of
 * `0` is reported as `Pending/Limit`; the PageRank score is printed with
 * three decimals (missing scores as `0.000`).
 *
 * Fields containing a comma, double quote, CR or LF are wrapped in double
 * quotes with embedded quotes doubled, so URLs with such characters do not
 * corrupt the row. Fields without them are emitted unchanged.
 *
 * @param {{nodes: object[], edges: object[]}} graphData - Nodes carry
 *   `url`, `depth`, `status`, `pageRankScore`; edges carry `source`, `target`.
 * @returns {string} Header row plus one row per node, joined with `\n`.
 */
export function renderSitegraphCsvNodes(graphData) {
  // null/undefined become empty fields, matching Array.prototype.join.
  const toCsvField = (value) => {
    if (value == null) {
      return '';
    }
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  // Count each edge once instead of re-scanning all edges for every node.
  const outboundByUrl = new Map();
  const inboundByUrl = new Map();
  for (const edge of graphData.edges) {
    outboundByUrl.set(edge.source, (outboundByUrl.get(edge.source) ?? 0) + 1);
    inboundByUrl.set(edge.target, (inboundByUrl.get(edge.target) ?? 0) + 1);
  }

  const header = ['URL', 'Depth', 'Status', 'InboundLinks', 'OutboundLinks', 'PageRankScore'].join(',');
  const rows = graphData.nodes.map((node) => {
    const status = node.status === 0 ? 'Pending/Limit' : node.status;
    return [
      node.url,
      node.depth,
      status,
      inboundByUrl.get(node.url) ?? 0,
      outboundByUrl.get(node.url) ?? 0,
      (node.pageRankScore || 0).toFixed(3),
    ].map(toCsvField).join(',');
  });
  return [header, ...rows].join('\n');
}
11
/**
 * Renders the graph's edges as CSV text with columns Source, Target, Weight.
 *
 * Fields containing a comma, double quote, CR or LF are wrapped in double
 * quotes with embedded quotes doubled; other fields are emitted unchanged.
 *
 * @param {{edges: object[]}} graphData - Edges carry `source`, `target`
 *   and `weight`.
 * @returns {string} Header row plus one row per edge, joined with `\n`.
 */
export function renderSitegraphCsvEdges(graphData) {
  // null/undefined become empty fields, matching Array.prototype.join.
  const toCsvField = (value) => {
    if (value == null) {
      return '';
    }
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const header = ['Source', 'Target', 'Weight'].join(',');
  const rows = graphData.edges.map(
    (edge) => [edge.source, edge.target, edge.weight].map(toCsvField).join(','),
  );
  return [header, ...rows].join('\n');
}
16
/**
 * Renders a Markdown summary of a crawl.
 *
 * Sections: a metrics list, a "Top Pages" table of the ten nodes with the
 * most inbound edges, and, when `metrics.topPageRankPages` is non-empty,
 * a table of up to ten PageRank entries.
 *
 * URLs written into table cells have `|` escaped as `\|` so a pipe in a
 * URL cannot split the row into extra columns.
 *
 * @param {string} url - Crawl root shown in the heading.
 * @param {{nodes: object[], edges: object[]}} graphData - Nodes carry
 *   `url` and `status`; edges carry `target`.
 * @param {object} metrics - Reads `totalPages`, `totalEdges`,
 *   `averageDepth`, `maxDepthFound`, `crawlEfficiencyScore` and the
 *   optional `topPageRankPages` (`{ url }[]`).
 * @param {object} graph - Reads optional `sessionStats.pagesFetched`;
 *   PageRank scores are looked up via `graph.nodes.get(url)` when
 *   available, otherwise by searching `graph.getNodes()`.
 * @returns {string} The Markdown document, lines joined with `\n`.
 */
export function renderSitegraphMarkdown(url, graphData, metrics, graph) {
  const escapeCell = (value) => String(value).replace(/\|/g, '\\|');

  const md = [
    `# Crawlith Crawl Summary - ${url}`,
    '',
    `## 📊 Metrics`,
    `- Total Pages Discovered: ${metrics.totalPages}`,
    `- Session Pages Crawled: ${graph.sessionStats?.pagesFetched ?? 0}`,
    `- Total Edges: ${metrics.totalEdges}`,
    `- Avg Depth: ${metrics.averageDepth.toFixed(2)}`,
    `- Max Depth: ${metrics.maxDepthFound}`,
    `- Crawl Efficiency: ${(metrics.crawlEfficiencyScore * 100).toFixed(1)}%`,
    '',
    `## 📄 Top Pages (by In-degree)`,
  ];

  // Count inbound edges in one pass rather than filtering every edge per node.
  const inboundByUrl = new Map();
  for (const edge of graphData.edges) {
    inboundByUrl.set(edge.target, (inboundByUrl.get(edge.target) ?? 0) + 1);
  }

  const topPages = graphData.nodes
    .map((node) => ({ ...node, inLinks: inboundByUrl.get(node.url) ?? 0 }))
    .sort((a, b) => b.inLinks - a.inLinks)
    .slice(0, 10);

  md.push('| URL | Inbound | Status |');
  md.push('| :--- | :--- | :--- |');
  for (const page of topPages) {
    const statusStr = page.status === 0 ? 'Pending/Limit' : page.status;
    md.push(`| ${escapeCell(page.url)} | ${page.inLinks} | ${statusStr} |`);
  }

  if (metrics.topPageRankPages?.length > 0) {
    md.push('');
    md.push('## 🏆 Top PageRank Pages');
    md.push('| URL | Score |');
    md.push('| :--- | :--- |');
    for (const entry of metrics.topPageRankPages.slice(0, 10)) {
      // Prefer a Map-style lookup; fall back to scanning getNodes().
      const node = graph.nodes?.get
        ? graph.nodes.get(entry.url)
        : graph.getNodes?.().find((candidate) => candidate.url === entry.url);
      const score = node?.pageRankScore ?? 0;
      md.push(`| ${escapeCell(entry.url)} | ${score.toFixed(3)}/100 |`);
    }
  }

  return md.join('\n');
}
@@ -0,0 +1 @@
1
+ export declare const SITEGRAPH_HTML = "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n <title>Crawlith Site Graph</title>\n <style>\n :root {\n --bg-color: #121212;\n --text-color: #e0e0e0;\n --panel-bg: #1e1e1e;\n --border-color: #333;\n --accent-color: #4a90e2;\n --sidebar-width: 300px;\n }\n body { margin: 0; font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, Helvetica, Arial, sans-serif; background: var(--bg-color); color: var(--text-color); height: 100vh; display: flex; flex-direction: column; overflow: hidden; }\n\n /* Layout */\n header { padding: 0 20px; background: var(--panel-bg); border-bottom: 1px solid var(--border-color); display: flex; justify-content: space-between; align-items: center; height: 60px; box-sizing: border-box; z-index: 10; }\n main { flex: 1; display: flex; overflow: hidden; position: relative; }\n #graph-container { flex: 1; position: relative; overflow: hidden; background: var(--bg-color); }\n #details-panel { width: var(--sidebar-width); background: var(--panel-bg); border-left: 1px solid var(--border-color); padding: 20px; overflow-y: auto; box-sizing: border-box; display: none; flex-direction: column; gap: 15px; }\n #details-panel.visible { display: flex; }\n footer { padding: 5px 20px; background: var(--panel-bg); border-top: 1px solid var(--border-color); font-size: 0.8rem; text-align: center; color: #666; height: 30px; display: flex; align-items: center; justify-content: center; }\n\n /* Header Components */\n .brand { font-weight: bold; font-size: 1.2rem; display: flex; align-items: center; gap: 10px; }\n .brand span { color: var(--accent-color); }\n #metrics-summary { font-size: 0.9rem; color: #aaa; display: flex; gap: 20px; }\n .metric { display: flex; flex-direction: column; align-items: center; line-height: 1.1; }\n .metric-value { font-weight: bold; color: var(--text-color); }\n .metric-label { 
font-size: 0.7rem; }\n\n #controls { display: flex; gap: 10px; align-items: center; }\n .btn-group { display: flex; background: #333; border-radius: 4px; overflow: hidden; }\n button { background: transparent; color: #aaa; border: none; padding: 6px 12px; cursor: pointer; font-size: 0.85rem; transition: all 0.2s; }\n button:hover { color: white; background: rgba(255,255,255,0.1); }\n button.active { background: var(--accent-color); color: white; }\n\n /* Search */\n #search-container { position: absolute; top: 15px; left: 15px; z-index: 5; }\n #search-input { background: rgba(30,30,30,0.9); border: 1px solid #444; color: white; padding: 8px 12px; border-radius: 20px; width: 200px; outline: none; transition: width 0.3s; }\n #search-input:focus { width: 280px; border-color: var(--accent-color); }\n\n /* Graph */\n svg { width: 100%; height: 100%; display: block; }\n .node { cursor: pointer; transition: stroke-width 0.1s; }\n .link { stroke: #555; stroke-opacity: 0.3; fill: none; pointer-events: none; }\n\n /* Interaction States */\n .node.highlight { stroke: #fff; stroke-width: 2px; }\n .link.highlight { stroke-opacity: 0.8; stroke: #999; }\n .node.faded { opacity: 0.1; }\n .link.faded { opacity: 0.05; }\n\n /* Details Panel Content */\n .detail-section { border-bottom: 1px solid #333; padding-bottom: 10px; }\n .detail-section:last-child { border-bottom: none; }\n .detail-label { font-size: 0.75rem; color: #888; text-transform: uppercase; letter-spacing: 0.5px; margin-bottom: 4px; }\n .detail-value { font-size: 0.95rem; word-break: break-all; }\n .detail-list { list-style: none; padding: 0; margin: 0; max-height: 150px; overflow-y: auto; font-size: 0.85rem; }\n .detail-list li { padding: 4px 0; border-bottom: 1px solid #2a2a2a; }\n .detail-list a { color: var(--accent-color); text-decoration: none; }\n .detail-list a:hover { text-decoration: underline; }\n\n .status-badge { display: inline-block; padding: 2px 6px; border-radius: 3px; font-size: 0.75rem; font-weight: 
bold; margin-top: 5px; }\n .status-ok { background: #2e7d32; color: white; }\n .status-warn { background: #f9a825; color: black; }\n .status-error { background: #c62828; color: white; }\n\n /* Tooltip */\n #tooltip { position: absolute; background: rgba(20,20,20,0.95); color: white; padding: 10px; border-radius: 6px; pointer-events: none; font-size: 12px; z-index: 100; box-shadow: 0 4px 15px rgba(0,0,0,0.5); border: 1px solid #444; display: none; transform: translate(-50%, -100%); margin-top: -10px; white-space: nowrap; }\n\n /* Responsive Sidebar */\n @media (max-width: 768px) {\n #details-panel { position: absolute; right: 0; top: 0; bottom: 0; z-index: 20; box-shadow: -5px 0 15px rgba(0,0,0,0.5); transform: translateX(100%); transition: transform 0.3s ease; }\n #details-panel.visible { transform: translateX(0); }\n #metrics-summary { display: none; }\n }\n </style>\n</head>\n<body>\n <header>\n <div class=\"brand\"><span>Crawlith</span> SiteGraph</div>\n\n <div id=\"metrics-summary\">\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-pages\">-</span><span class=\"metric-label\">Pages</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-depth\">-</span><span class=\"metric-label\">Max Depth</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-eff\">-</span><span class=\"metric-label\">Efficiency</span></div>\n <div class=\"metric\"><span class=\"metric-value\" id=\"m-orphan\">-</span><span class=\"metric-label\">Orphans</span></div>\n </div>\n\n <div id=\"controls\">\n <div class=\"btn-group\" style=\"margin-right: 15px;\">\n <button id=\"btn-auth-pagerank\" class=\"active\" title=\"PageRank Authority\">PageRank</button>\n <button id=\"btn-auth-structural\" title=\"Structural Authority (In-Degree)\">In-Degree</button>\n </div>\n <div class=\"btn-group\">\n <button id=\"btn-hierarchical\" class=\"active\">Hierarchical</button>\n <button id=\"btn-radial\">Radial</button>\n </div>\n </div>\n </header>\n\n <main>\n 
<div id=\"graph-container\">\n <div id=\"search-container\">\n <input type=\"text\" id=\"search-input\" placeholder=\"Search URL...\">\n </div>\n <svg id=\"graph\"></svg>\n <div id=\"tooltip\"></div>\n </div>\n\n <aside id=\"details-panel\">\n <div class=\"detail-section\">\n <div class=\"detail-label\">URL</div>\n <div class=\"detail-value\" id=\"d-url\">-</div>\n <div id=\"d-status\"></div>\n </div>\n <div class=\"detail-section\" style=\"display: flex; gap: 20px;\">\n <div>\n <div class=\"detail-label\">Depth</div>\n <div class=\"detail-value\" id=\"d-depth\">-</div>\n </div>\n <div>\n <div class=\"detail-label\">Authority</div>\n <div class=\"detail-value\" id=\"d-auth-container\">-</div>\n </div>\n </div>\n <div class=\"detail-section\">\n <div class=\"detail-label\">In-links (<span id=\"d-in-count\">0</span>)</div>\n <!-- List could be populated here if we had the reverse index, for now just count -->\n </div>\n <div class=\"detail-section\">\n <div class=\"detail-label\">Out-links (<span id=\"d-out-count\">0</span>)</div>\n <ul class=\"detail-list\" id=\"d-out-list\"></ul>\n </div>\n </aside>\n </main>\n\n <footer>\n Generated by Crawlith Crawler\n </footer>\n\n <!-- D3 from CDN -->\n <script src=\"https://d3js.org/d3.v7.min.js\"></script>\n\n <script>\n // --- State ---\n const state = {\n nodes: [],\n links: [],\n metrics: {},\n adjacency: new Map(), // url -> { in: [], out: [] }\n simulation: null,\n width: 0,\n height: 0,\n transform: d3.zoomIdentity,\n activeNode: null,\n mode: 'hierarchical', // 'hierarchical' | 'radial'\n maxDepth: 0,\n maxInLinks: 0,\n nodeSelection: null,\n linkSelection: null,\n zoom: null\n };\n\n // --- DOM Elements ---\n const svg = d3.select(\"#graph\");\n const container = svg.append(\"g\");\n const linkGroup = container.append(\"g\").attr(\"class\", \"links\");\n const nodeGroup = container.append(\"g\").attr(\"class\", \"nodes\");\n const tooltip = d3.select(\"#tooltip\");\n const detailsPanel = 
d3.select(\"#details-panel\");\n\n // --- Initialization ---\n // --- Initialization ---\n async function init() {\n try {\n let graphData, metricsData;\n\n // 1. Try to use injected data (for file:// usage)\n // @ts-ignore\n if (window.GRAPH_DATA) graphData = window.GRAPH_DATA;\n // @ts-ignore\n if (window.METRICS_DATA) metricsData = window.METRICS_DATA;\n\n // 2. Fallback to fetching JSON files (for web server usage)\n if (!graphData || !metricsData) {\n try {\n const [graphRes, metricsRes] = await Promise.all([\n fetch('graph.json'),\n fetch('metrics.json')\n ]);\n if (graphRes.ok && metricsRes.ok) {\n graphData = await graphRes.json();\n metricsData = await metricsRes.json();\n }\n } catch (e) {\n console.warn(\"Fetch failed, possibly due to CORS or missing files.\", e);\n }\n }\n\n if (!graphData || !metricsData) {\n throw new Error(\"No data available. Ensure graph.json exists or data is injected.\");\n }\n\n state.metrics = metricsData;\n processData(graphData);\n updateMetricsUI();\n\n // Setup UI\n setupResize();\n setupInteractions();\n setupSearch();\n\n // Start Simulation\n initSimulation();\n\n } catch (err) {\n console.error(err);\n alert(\"Error loading visualization data: \" + err.message);\n }\n }\n\n function processData(data) {\n // Create a map for fast lookup\n const nodeMap = new Map();\n\n data.nodes.forEach(n => {\n n.inLinks = n.inLinks || 0;\n n.outLinks = n.outLinks || 0;\n nodeMap.set(n.url, n);\n });\n\n // Filter valid links\n state.links = data.edges\n .map(e => ({ source: nodeMap.get(e.source), target: nodeMap.get(e.target) }))\n .filter(e => e.source && e.target);\n\n state.nodes = data.nodes;\n\n // Calculate Stats\n state.maxDepth = d3.max(state.nodes, d => d.depth) || 1;\n state.maxInLinks = d3.max(state.nodes, d => d.inLinks) || 1;\n\n // Calculate Authority & Enrich Nodes\n state.nodes.forEach(n => {\n // Structural Authority: log-scaled normalized 0-1 based on in-links\n n.structuralAuthority = Math.log(1 + n.inLinks) / 
Math.log(1 + state.maxInLinks);\n\n // PageRank Authority: normalized 0-1 from pageRankScore (0-100)\n if (typeof n.pageRankScore === 'number') {\n n.pageRankAuthority = n.pageRankScore / 100;\n } else {\n n.pageRankAuthority = n.structuralAuthority;\n }\n\n // Default authority to PageRank if available, else structural\n n.authority = n.pageRankAuthority;\n\n // Ensure x,y are initialized to avoid NaNs if D3 doesn't do it fast enough\n n.x = 0; n.y = 0;\n });\n\n // Build Adjacency Map\n state.nodes.forEach(n => state.adjacency.set(n.url, { in: [], out: [] }));\n state.links.forEach(l => {\n state.adjacency.get(l.source.url).out.push(l.target);\n state.adjacency.get(l.target.url).in.push(l.source);\n });\n }\n\n function updateMetricsUI() {\n document.getElementById('m-pages').textContent = state.metrics.totalPages;\n document.getElementById('m-depth').textContent = state.metrics.maxDepthFound;\n document.getElementById('m-eff').textContent = (state.metrics.crawlEfficiencyScore * 100).toFixed(1) + '%';\n document.getElementById('m-orphan').textContent = state.metrics.orphanPages.length;\n }\n\n // --- Simulation ---\n function initSimulation() {\n const { width, height } = getDimensions();\n state.width = width;\n state.height = height;\n\n // Safeguards\n const nodeCount = state.nodes.length;\n const enableCollision = nodeCount <= 1200;\n const alphaDecay = nodeCount > 1000 ? 0.05 : 0.02; // Faster decay for large graphs\n\n state.simulation = d3.forceSimulation(state.nodes)\n .alphaDecay(alphaDecay)\n .force(\"link\", d3.forceLink(state.links).id(d => d.url).strength(0.5)) // Reduced strength for flexibility\n .force(\"charge\", d3.forceManyBody().strength(nodeCount > 1000 ? 
-100 : -300))\n .force(\"center\", d3.forceCenter(width / 2, height / 2));\n\n if (enableCollision) {\n state.simulation.force(\"collide\", d3.forceCollide().radius(d => getNodeRadius(d) + 2).iterations(1));\n }\n\n // Apply Layout Mode\n applyLayoutMode(state.mode);\n\n // Rendering loop\n state.simulation.on(\"tick\", ticked);\n\n // Render initial SVG elements\n render();\n }\n\n function applyLayoutMode(mode) {\n state.mode = mode;\n const { width, height } = state;\n const centerY = height / 2;\n const centerX = width / 2;\n\n // Remove conflicting forces\n state.simulation.force(\"y\", null);\n state.simulation.force(\"radial\", null);\n\n if (mode === 'hierarchical') {\n const depthSpacing = height / (state.maxDepth + 2);\n // Hierarchical: Nodes pushed to Y levels based on depth\n state.simulation.force(\"y\", d3.forceY(d => {\n return (d.depth * depthSpacing) - (height/2) + 50; // Offset to start from top\n }).strength(1));\n // We rely on \"center\" force to keep X centered, but maybe add weak forceX?\n // Let's add weak forceX to prevent wide spread\n state.simulation.force(\"x\", d3.forceX(0).strength(0.05));\n state.simulation.force(\"center\", d3.forceCenter(width/2, height/2)); // Recenter\n\n } else if (mode === 'radial') {\n const maxRadius = Math.min(width, height) / 2 - 50;\n const ringSpacing = maxRadius / (state.maxDepth + 1);\n\n state.simulation.force(\"radial\", d3.forceRadial(\n d => d.depth * ringSpacing,\n width / 2,\n height / 2\n ).strength(0.8));\n\n state.simulation.force(\"x\", null); // Remove X constraint\n }\n\n state.simulation.alpha(1).restart();\n }\n\n function getNodeRadius(d) {\n // 5 + authority * 15\n return 5 + (d.authority * 15);\n }\n\n function getNodeColor(d) {\n // Depth-based sequential color (Blue -> Purple -> Pink)\n const t = d.depth / (state.maxDepth || 1);\n return d3.interpolateViridis(1 - t); // Invert Viridis for better contrast on dark\n }\n\n function render() {\n // Links\n state.linkSelection = 
linkGroup.selectAll(\"line\")\n .data(state.links)\n .join(\"line\")\n .attr(\"class\", \"link\")\n .attr(\"stroke-width\", 0.5);\n\n // Nodes\n state.nodeSelection = nodeGroup.selectAll(\"circle\")\n .data(state.nodes)\n .join(\"circle\")\n .attr(\"class\", \"node\")\n .attr(\"r\", d => getNodeRadius(d))\n .attr(\"fill\", d => getNodeColor(d))\n .attr(\"stroke\", d => d.status >= 400 ? \"#ff4444\" : null) // Red stroke for errors\n .on(\"mouseover\", (event, d) => {\n if (state.activeNode) return;\n highlightNode(d);\n showTooltip(event, d);\n })\n .on(\"mouseout\", () => {\n if (state.activeNode) return;\n resetHighlight();\n hideTooltip();\n })\n .on(\"click\", (event, d) => {\n event.stopPropagation();\n selectNode(d);\n })\n .call(d3.drag()\n .on(\"start\", dragstarted)\n .on(\"drag\", dragged)\n .on(\"end\", dragended));\n\n // Zoom\n state.zoom = d3.zoom()\n .scaleExtent([0.1, 4])\n .on(\"zoom\", (event) => {\n state.transform = event.transform;\n container.attr(\"transform\", event.transform);\n });\n\n svg.call(state.zoom)\n .call(state.zoom.transform, d3.zoomIdentity.translate(state.width/2, state.height/2).scale(0.8).translate(-state.width/2, -state.height/2)); // Initial zoom out\n }\n\n function ticked() {\n if (state.linkSelection) {\n state.linkSelection\n .attr(\"x1\", d => d.source.x)\n .attr(\"y1\", d => d.source.y)\n .attr(\"x2\", d => d.target.x)\n .attr(\"y2\", d => d.target.y);\n }\n\n if (state.nodeSelection) {\n state.nodeSelection\n .attr(\"cx\", d => d.x)\n .attr(\"cy\", d => d.y);\n }\n }\n\n // --- Interactions ---\n\n function setupInteractions() {\n // Background click to clear selection\n svg.on(\"click\", () => {\n state.activeNode = null;\n resetHighlight();\n detailsPanel.classed(\"visible\", false);\n });\n\n // Layout Toggle\n d3.select(\"#btn-hierarchical\").on(\"click\", function() {\n setMode('hierarchical', this);\n });\n d3.select(\"#btn-radial\").on(\"click\", function() {\n setMode('radial', this);\n });\n\n // Authority 
Toggle\n d3.select(\"#btn-auth-pagerank\").on(\"click\", function() {\n setAuthorityMode('pagerank', this);\n });\n d3.select(\"#btn-auth-structural\").on(\"click\", function() {\n setAuthorityMode('structural', this);\n });\n }\n\n function setAuthorityMode(mode, btn) {\n d3.select(\"#btn-auth-pagerank\").classed(\"active\", false);\n d3.select(\"#btn-auth-structural\").classed(\"active\", false);\n d3.select(btn).classed(\"active\", true);\n\n state.nodes.forEach(n => {\n n.authority = mode === 'pagerank' ? n.pageRankAuthority : n.structuralAuthority;\n });\n\n // Update Visuals\n nodeGroup.selectAll(\"circle\")\n .transition().duration(500)\n .attr(\"r\", d => getNodeRadius(d));\n\n // Update collision force if enabled\n if (state.simulation.force(\"collide\")) {\n state.simulation.force(\"collide\", d3.forceCollide().radius(d => getNodeRadius(d) + 2).iterations(1));\n state.simulation.alpha(0.3).restart();\n }\n }\n\n function setMode(mode, btn) {\n d3.selectAll(\"#controls button\").classed(\"active\", false);\n d3.select(btn).classed(\"active\", true);\n applyLayoutMode(mode);\n }\n\n function highlightNode(d) {\n const neighbors = new Set();\n const adj = state.adjacency.get(d.url);\n if (adj) {\n adj.in.forEach(n => neighbors.add(n.url));\n adj.out.forEach(n => neighbors.add(n.url));\n }\n neighbors.add(d.url);\n\n nodeGroup.selectAll(\"circle\").classed(\"faded\", n => !neighbors.has(n.url));\n nodeGroup.selectAll(\"circle\").classed(\"highlight\", n => n.url === d.url);\n\n linkGroup.selectAll(\"line\").classed(\"faded\", l =>\n l.source.url !== d.url && l.target.url !== d.url\n );\n linkGroup.selectAll(\"line\").classed(\"highlight\", l =>\n l.source.url === d.url || l.target.url === d.url\n );\n }\n\n function resetHighlight() {\n nodeGroup.selectAll(\"circle\").classed(\"faded\", false).classed(\"highlight\", false);\n linkGroup.selectAll(\"line\").classed(\"faded\", false).classed(\"highlight\", false);\n }\n\n function selectNode(d) {\n 
state.activeNode = d;\n highlightNode(d);\n showDetails(d);\n }\n\n function showTooltip(event, d) {\n // If we are transforming the container, we need to map coordinates correctly or just use pageX/Y\n tooltip.style(\"display\", \"block\")\n .html(`<strong>${new URL(d.url).pathname}</strong><br>Auth: ${(d.authority * 10).toFixed(1)}`)\n .style(\"left\", (event.pageX) + \"px\")\n .style(\"top\", (event.pageY - 10) + \"px\");\n }\n\n function hideTooltip() {\n tooltip.style(\"display\", \"none\");\n }\n\n function showDetails(d) {\n detailsPanel.classed(\"visible\", true);\n d3.select(\"#d-url\").text(d.url);\n d3.select(\"#d-depth\").text(d.depth);\n\n const authContainer = d3.select(\"#d-auth-container\");\n authContainer.html(\"\");\n const prVal = (d.pageRankAuthority * 100).toFixed(1);\n const structVal = d.structuralAuthority.toFixed(3);\n authContainer.append(\"div\").html(`PR: <strong>${prVal}</strong>`);\n authContainer.append(\"div\").style(\"color\", \"#888\").style(\"font-size\", \"0.8em\").text(`In-Degree: ${structVal}`);\n\n d3.select(\"#d-in-count\").text(d.inLinks);\n d3.select(\"#d-out-count\").text(d.outLinks);\n\n // Status badge\n const statusDiv = d3.select(\"#d-status\");\n statusDiv.html(\"\");\n let sClass = \"status-ok\";\n if (d.status >= 400) sClass = \"status-error\";\n else if (d.status >= 300) sClass = \"status-warn\";\n statusDiv.append(\"span\").attr(\"class\", \"status-badge \" + sClass).text(d.status);\n\n // Outlinks list (limit to 20)\n const list = d3.select(\"#d-out-list\");\n list.html(\"\");\n const adj = state.adjacency.get(d.url);\n if (adj && adj.out.length > 0) {\n adj.out.slice(0, 50).forEach(target => {\n list.append(\"li\").append(\"a\")\n .attr(\"href\", target.url)\n .attr(\"target\", \"_blank\")\n .text(new URL(target.url).pathname);\n });\n if (adj.out.length > 50) {\n list.append(\"li\").text(`...and ${adj.out.length - 50} more`);\n }\n } else {\n list.append(\"li\").text(\"No outgoing links\");\n }\n }\n\n // --- 
Search ---\n function setupSearch() {\n const input = document.getElementById('search-input');\n input.addEventListener('keydown', (e) => {\n if (e.key === 'Enter') {\n const val = input.value.trim().toLowerCase();\n if (!val) return;\n\n const found = state.nodes.find(n => n.url.toLowerCase().includes(val));\n if (found) {\n selectNode(found);\n // Center view on node\n const transform = d3.zoomIdentity\n .translate(state.width/2, state.height/2)\n .scale(2)\n .translate(-found.x, -found.y);\n\n svg.transition().duration(750).call(state.zoom.transform, transform);\n }\n }\n });\n }\n\n function setupResize() {\n window.addEventListener(\"resize\", () => {\n const { width, height } = getDimensions();\n state.width = width;\n state.height = height;\n state.simulation.force(\"center\", d3.forceCenter(width / 2, height / 2));\n if (state.mode === 'hierarchical') {\n // Re-evaluate Y force if needed, but usually center is enough\n }\n state.simulation.alpha(0.3).restart();\n });\n }\n\n function getDimensions() {\n const rect = document.getElementById(\"graph-container\").getBoundingClientRect();\n return { width: rect.width, height: rect.height };\n }\n\n // --- Dragging ---\n function dragstarted(event, d) {\n if (!event.active) state.simulation.alphaTarget(0.3).restart();\n d.fx = d.x;\n d.fy = d.y;\n }\n\n function dragged(event, d) {\n d.fx = event.x;\n d.fy = event.y;\n }\n\n function dragended(event, d) {\n if (!event.active) state.simulation.alphaTarget(0);\n d.fx = null;\n d.fy = null;\n }\n\n // Start\n if (document.readyState === 'loading') {\n document.addEventListener('DOMContentLoaded', init);\n } else {\n init();\n }\n </script>\n</body>\n</html>\n";