@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,302 +0,0 @@
1
- import { Graph } from './graph.js';
2
- import { SimHash } from './simhash.js';
3
/**
 * Finds exact and near-duplicate pages in `graph`, annotates the affected
 * nodes and clusters, and optionally rewires edges onto cluster
 * representatives.
 *
 * @param {object} graph - Link graph; must expose `getNodes()` and whatever
 *   the cluster-processing and edge-collapsing helpers use.
 * @param {object} [options]
 * @param {boolean} [options.collapse] - Collapsing is on unless this is
 *   exactly `false`.
 * @param {number} [options.simhashThreshold] - Threshold handed to the
 *   near-duplicate pass; defaults to 3 when null/undefined.
 * @returns {void}
 */
export function detectDuplicates(graph, options = {}) {
    const shouldCollapse = options.collapse !== false;
    const maxDistance = options.simhashThreshold ?? 3;
    const firstClusterId = 1;

    // Exact pass: pages sharing a content hash. Pages whose hash is unique
    // come back as candidates for the near-duplicate pass.
    const exactPass = findExactDuplicates(graph.getNodes(), firstClusterId);

    // Near pass continues the cluster numbering where the exact pass stopped.
    const nearPass = findNearDuplicates(exactPass.nearCandidates, maxDistance, exactPass.nextId);

    // Classify every cluster, pick its representative and record it on the graph.
    processClusters(exactPass.exactClusters.concat(nearPass.nearClusters), graph, shouldCollapse);

    // Edge rewiring happens once, after all nodes know where they collapse to.
    if (shouldCollapse) {
        collapseEdges(graph);
    }
}
25
/**
 * Exact-duplicate pass: buckets nodes by content hash, then turns the buckets
 * into clusters.
 *
 * @param {object[]} nodes - Graph nodes to inspect.
 * @param {number} startId - First numeric suffix to use for cluster ids.
 * @returns {{exactClusters: object[], nearCandidates: object[], nextId: number}}
 */
function findExactDuplicates(nodes, startId) {
    return createExactClusters(groupNodesByContentHash(nodes), startId);
}
29
/**
 * Buckets nodes by their `contentHash`.
 *
 * Nodes without a content hash, or whose `status` is not 200, are left out.
 *
 * @param {object[]} nodes - Graph nodes to bucket.
 * @returns {Map<*, object[]>} Content hash -> nodes carrying it, in input order.
 */
function groupNodesByContentHash(nodes) {
    const byHash = new Map();
    for (const page of nodes) {
        const eligible = Boolean(page.contentHash) && page.status === 200;
        if (eligible) {
            const bucket = byHash.get(page.contentHash);
            if (bucket === undefined) {
                byHash.set(page.contentHash, [page]);
            }
            else {
                bucket.push(page);
            }
        }
    }
    return byHash;
}
43
/**
 * Converts content-hash buckets into exact-duplicate clusters.
 *
 * A bucket with more than one node becomes a cluster and its nodes are tagged
 * with `duplicateClusterId` / `duplicateType = 'exact'`. Otherwise the
 * bucket's first entry is forwarded as a near-duplicate candidate.
 *
 * @param {Map<*, object[]>} exactMap - Buckets keyed by content hash.
 * @param {number} startId - Numeric suffix for the first cluster id.
 * @returns {{exactClusters: object[], nearCandidates: object[], nextId: number}}
 *   `nextId` is the first suffix not consumed here.
 */
function createExactClusters(exactMap, startId) {
    const exactClusters = [];
    const nearCandidates = [];
    let nextId = startId;
    for (const members of exactMap.values()) {
        if (members.length <= 1) {
            nearCandidates.push(members[0]);
            continue;
        }
        const id = `cluster_exact_${nextId}`;
        nextId += 1;
        exactClusters.push({ id, type: 'exact', nodes: members });
        members.forEach((member) => {
            member.duplicateClusterId = id;
            member.duplicateType = 'exact';
        });
    }
    return { exactClusters, nearCandidates, nextId };
}
62
/**
 * Near-duplicate pass: bucket candidates by simhash band, connect candidates
 * that are within `threshold`, then materialise the connected groups.
 *
 * @param {object[]} candidates - Nodes not already in an exact cluster.
 * @param {number} threshold - Largest simhash distance still counted as near.
 * @param {number} startId - Numeric suffix for the first near cluster id.
 * @returns {{nearClusters: object[], nextId: number}}
 */
function findNearDuplicates(candidates, threshold, startId) {
    const buckets = buildSimHashBuckets(candidates);
    const components = findConnectedComponents(buckets.bandsMaps, buckets.simhashes, candidates.length, threshold);
    return extractClusters(components.parent, components.involvedIndices, candidates, startId);
}
67
/**
 * Indexes candidates by simhash band.
 *
 * @param {object[]} candidates - Nodes; those with a falsy `simhash` are
 *   skipped and keep a zero in `simhashes`.
 * @returns {{bandsMaps: Map<*, number[]>[], simhashes: BigUint64Array, validIndices: number[]}}
 *   `bandsMaps[b]` maps a band value to the candidate indices sharing it,
 *   `simhashes[i]` is candidate i's hash as a 64-bit unsigned integer, and
 *   `validIndices` lists the candidates that had a simhash.
 */
function buildSimHashBuckets(candidates) {
    const simhashes = new BigUint64Array(candidates.length);
    const validIndices = [];
    for (const [position, candidate] of candidates.entries()) {
        if (!candidate.simhash) {
            continue;
        }
        simhashes[position] = BigInt(candidate.simhash);
        validIndices.push(position);
    }

    // One lookup table per band.
    const bandsMaps = [];
    for (let b = 0; b < SimHash.BANDS; b++) {
        bandsMaps.push(new Map());
    }

    for (const position of validIndices) {
        const bandKeys = SimHash.getBands(simhashes[position]);
        bandsMaps.forEach((bandMap, b) => {
            const key = bandKeys[b];
            const sharing = bandMap.get(key);
            if (sharing === undefined) {
                bandMap.set(key, [position]);
            }
            else {
                sharing.push(position);
            }
        });
    }
    return { bandsMaps, simhashes, validIndices };
}
91
/**
 * Links candidates whose simhashes are within `threshold` of each other,
 * using a union-find over candidate indices.
 *
 * Only candidates that share at least one band bucket are compared.
 *
 * @param {Map<*, number[]>[]} bandsMaps - Per-band buckets of candidate indices.
 * @param {BigUint64Array} simhashes - Simhash per candidate index.
 * @param {number} n - Number of candidates.
 * @param {number} threshold - Largest `SimHash.hammingDistance` that still links two candidates.
 * @returns {{parent: Uint32Array, involvedIndices: Set<number>}} The union-find
 *   parent array and the indices that took part in at least one link.
 */
function findConnectedComponents(bandsMaps, simhashes, n, threshold) {
    // Every index starts as its own root with rank 0.
    const parent = Uint32Array.from({ length: n }, (_, i) => i);
    const rank = new Uint8Array(n);

    // Root lookup with full path compression.
    const find = (start) => {
        let root = start;
        while (parent[root] !== root) {
            root = parent[root];
        }
        for (let cursor = start; cursor !== root;) {
            const upward = parent[cursor];
            parent[cursor] = root;
            cursor = upward;
        }
        return root;
    };

    // Union by rank; on a tie the first argument's root wins and grows.
    const union = (a, b) => {
        const rootA = find(a);
        const rootB = find(b);
        if (rootA === rootB) {
            return;
        }
        if (rank[rootA] < rank[rootB]) {
            parent[rootA] = rootB;
            return;
        }
        if (rank[rootA] === rank[rootB]) {
            rank[rootA]++;
        }
        parent[rootB] = rootA;
    };

    const involvedIndices = new Set();
    for (let b = 0; b < SimHash.BANDS; b++) {
        for (const bucket of bandsMaps[b].values()) {
            if (bucket.length < 2) {
                continue;
            }
            for (let j = 0; j < bucket.length - 1; j++) {
                for (let k = j + 1; k < bucket.length; k++) {
                    const left = bucket[j];
                    const right = bucket[k];
                    const leftRoot = find(left);
                    const rightRoot = find(right);
                    // The distance is only computed for pairs not already connected.
                    const linkable = leftRoot !== rightRoot &&
                        SimHash.hammingDistance(simhashes[left], simhashes[right]) <= threshold;
                    if (!linkable) {
                        continue;
                    }
                    union(leftRoot, rightRoot);
                    involvedIndices.add(left);
                    involvedIndices.add(right);
                }
            }
        }
    }
    return { parent, involvedIndices };
}
155
/**
 * Turns union-find results into near-duplicate clusters.
 *
 * @param {Uint32Array} parent - Union-find parent array; compressed in place
 *   as roots are resolved.
 * @param {Iterable<number>} involvedIndices - Candidate indices to group.
 * @param {object[]} candidates - Nodes addressed by those indices.
 * @param {number} startId - Numeric suffix for the first cluster id.
 * @returns {{nearClusters: object[], nextId: number}} Clusters of two or more
 *   nodes (each node tagged with `duplicateClusterId` and
 *   `duplicateType = 'near'`) and the first unused id suffix.
 */
function extractClusters(parent, involvedIndices, candidates, startId) {
    // Root lookup with path compression.
    const resolveRoot = (start) => {
        let root = start;
        while (parent[root] !== root) {
            root = parent[root];
        }
        for (let cursor = start; cursor !== root;) {
            const upward = parent[cursor];
            parent[cursor] = root;
            cursor = upward;
        }
        return root;
    };

    // Group the involved indices by the root of their component.
    const membersByRoot = new Map();
    for (const index of involvedIndices) {
        const root = resolveRoot(index);
        if (membersByRoot.has(root)) {
            membersByRoot.get(root).push(index);
        }
        else {
            membersByRoot.set(root, [index]);
        }
    }

    const nearClusters = [];
    let nextId = startId;
    for (const memberIndices of membersByRoot.values()) {
        if (memberIndices.length < 2) {
            continue;
        }
        const id = `cluster_near_${nextId}`;
        nextId += 1;
        const nodes = memberIndices.map((index) => candidates[index]);
        nearClusters.push({ id, type: 'near', nodes });
        nodes.forEach((node) => {
            node.duplicateClusterId = id;
            node.duplicateType = 'near';
        });
    }
    return { nearClusters, nextId };
}
195
/**
 * Runs per-cluster processing on each cluster in order.
 *
 * @param {object[]} clusters - Exact and near clusters.
 * @param {object} graph - Graph that receives the cluster summaries.
 * @param {boolean} collapse - Whether non-representative nodes get marked as collapsed.
 * @returns {void}
 */
function processClusters(clusters, graph, collapse) {
    clusters.forEach((cluster) => {
        processSingleCluster(cluster, graph, collapse);
    });
}
200
/**
 * Classifies one cluster, scores its severity, chooses its representative and
 * records the outcome on the graph.
 *
 * @param {object} cluster - Cluster to process; gains `severity` and `representative`.
 * @param {object} graph - Graph that receives the cluster summary.
 * @param {boolean} collapse - Whether non-representative nodes get marked as collapsed.
 * @returns {void}
 */
function processSingleCluster(cluster, graph, collapse) {
    // Runs first on purpose of ordering only: severity below reads
    // `cluster.type`, which this step may have rewritten.
    checkTemplateHeavy(cluster);
    cluster.severity = calculateSeverity(cluster);

    const primary = selectRepresentative(cluster);
    cluster.representative = primary.url;

    applyClusterToGraph(cluster, primary, graph, collapse);
}
207
/**
 * Re-labels a cluster as `'template_heavy'` when the mean `uniqueTokenRatio`
 * of its nodes is below 0.3. A missing or falsy ratio counts as 0.
 *
 * @param {object} cluster - Cluster whose `type`, and its nodes'
 *   `duplicateType`, may be overwritten.
 * @returns {void}
 */
function checkTemplateHeavy(cluster) {
    let ratioTotal = 0;
    for (const member of cluster.nodes) {
        ratioTotal += member.uniqueTokenRatio || 0;
    }
    const meanRatio = ratioTotal / cluster.nodes.length;
    if (!(meanRatio < 0.3)) {
        return;
    }
    cluster.type = 'template_heavy';
    for (const member of cluster.nodes) {
        member.duplicateType = 'template_heavy';
    }
}
214
/**
 * Rates a duplicate cluster.
 *
 * - `'high'`   when any node lacks a canonical, or the nodes declare more
 *              than one distinct canonical;
 * - `'medium'` when canonicals agree and the cluster type is `'near'`;
 * - `'low'`    otherwise.
 *
 * @param {object} cluster - Cluster with `nodes` and `type`.
 * @returns {'high'|'medium'|'low'}
 */
function calculateSeverity(cluster) {
    const declared = new Set();
    let anyMissing = false;
    cluster.nodes.forEach((member) => {
        if (member.canonical) {
            declared.add(member.canonical);
        }
        else {
            anyMissing = true;
        }
    });
    const conflicting = anyMissing || declared.size > 1;
    if (conflicting) {
        return 'high';
    }
    return cluster.type === 'near' ? 'medium' : 'low';
}
233
/**
 * Chooses the node that represents a cluster.
 *
 * The first node whose canonical points at itself wins. Failing that, the
 * node with the most `inLinks` wins; ties go to the shorter URL, and
 * remaining ties to the node that appears earlier.
 *
 * @param {object} cluster - Cluster with a non-empty `nodes` array.
 * @returns {object} The representative node.
 */
function selectRepresentative(cluster) {
    // A self-canonical URL is by definition one of the cluster's own URLs,
    // so no separate membership check is needed.
    const selfCanonical = cluster.nodes.find((node) => node.canonical && node.url === node.canonical);
    if (selfCanonical !== undefined) {
        return selfCanonical;
    }
    return cluster.nodes.reduce((best, challenger) => {
        const moreLinked = challenger.inLinks > best.inLinks;
        const lessLinked = challenger.inLinks < best.inLinks;
        if (moreLinked) {
            return challenger;
        }
        if (lessLinked) {
            return best;
        }
        return challenger.url.length < best.url.length ? challenger : best;
    });
}
249
/**
 * Writes a processed cluster back to the graph.
 *
 * Every node gets `isClusterPrimary` set and its collapse flags reset, a
 * summary entry is appended to `graph.duplicateClusters`, and, when
 * `collapse` is truthy, every non-representative node is marked as collapsed
 * into the representative's URL.
 *
 * @param {object} cluster - Cluster with `id`, `type`, `severity` and `nodes`.
 * @param {object} representative - The cluster's chosen node.
 * @param {object} graph - Graph exposing a `duplicateClusters` array.
 * @param {boolean} collapse - Whether to mark non-representatives as collapsed.
 * @returns {void}
 */
function applyClusterToGraph(cluster, representative, graph, collapse) {
    const primaryUrl = representative.url;

    for (const member of cluster.nodes) {
        member.isClusterPrimary = member.url === primaryUrl;
        member.isCollapsed = false;
        member.collapseInto = undefined;
    }

    const summary = {
        id: cluster.id,
        type: cluster.type,
        size: cluster.nodes.length,
        representative: primaryUrl,
        severity: cluster.severity
    };
    graph.duplicateClusters.push(summary);

    if (!collapse) {
        return;
    }
    cluster.nodes
        .filter((member) => member.url !== primaryUrl)
        .forEach((member) => {
            member.isCollapsed = true;
            member.collapseInto = primaryUrl;
        });
}
271
/**
 * Rebuilds `graph.edges` so that edges pointing at a collapsed node point at
 * the node it collapses into, then recounts `inLinks` / `outLinks`.
 *
 * Only edge targets are redirected; sources are kept as they are. Edges whose
 * endpoints are unknown to the graph, and edges that become self-loops, are
 * dropped. When several edges land on the same key the largest weight is kept.
 *
 * @param {object} graph - Graph exposing `getEdges()`, `getNodes()`, a `nodes`
 *   map keyed by URL, and a writable `edges` property.
 * @returns {void}
 */
function collapseEdges(graph) {
    const collapsedWeights = new Map();
    for (const { source, target, weight } of graph.getEdges()) {
        const fromNode = graph.nodes.get(source);
        const toNode = graph.nodes.get(target);
        if (!fromNode || !toNode) {
            continue;
        }
        const redirected = toNode.isCollapsed && toNode.collapseInto ? toNode.collapseInto : target;
        if (source === redirected) {
            continue;
        }
        const key = Graph.getEdgeKey(source, redirected);
        const previous = collapsedWeights.get(key) || 0;
        collapsedWeights.set(key, Math.max(previous, weight));
    }
    graph.edges = collapsedWeights;

    // Link counts are recomputed from scratch against the collapsed edge set.
    for (const node of graph.getNodes()) {
        node.inLinks = 0;
        node.outLinks = 0;
    }
    for (const key of collapsedWeights.keys()) {
        const { source, target } = Graph.parseEdgeKey(key);
        const fromNode = graph.nodes.get(source);
        const toNode = graph.nodes.get(target);
        if (fromNode) {
            fromNode.outLinks++;
        }
        if (toNode) {
            toNode.inLinks++;
        }
    }
}
@@ -1,10 +0,0 @@
1
- import { Graph } from '../graph/graph.js';
2
/** Options accepted by `computeHITS`. */
export interface HITSOptions {
    /**
     * Number of update rounds to run.
     * NOTE(review): the compiled implementation appears to fall back to 20
     * when this is omitted or falsy; confirm before relying on it.
     */
    iterations?: number;
}
5
/**
 * Computes Hub and Authority scores using the HITS algorithm.
 * Operates purely on the internal link graph.
 * Optimized for performance using array-based adjacency lists.
 *
 * @param graph - Link graph to score.
 * @param options - Optional settings; see `HITSOptions`.
 * @returns Nothing (declared `void`).
 */
export declare function computeHITS(graph: Graph, options?: HITSOptions): void;
@@ -1,131 +0,0 @@
1
/**
 * Computes hub and authority scores with the HITS algorithm and stores them
 * on the eligible nodes, then classifies each eligible node's link role.
 *
 * A node is eligible when its status is 200 or 0, it has no redirect chain,
 * and it is not flagged `noindex`. Edges touching an ineligible node, and
 * self-loops, are ignored. Ineligible nodes are left untouched.
 *
 * @param {object} graph - Graph exposing `getNodes()` and `getEdges()`.
 * @param {object} [options]
 * @param {number} [options.iterations] - Update rounds; any falsy value
 *   (including 0) falls back to 20.
 * @returns {void}
 */
export function computeHITS(graph, options = {}) {
    const iterations = options.iterations || 20;

    const eligibleNodes = graph.getNodes().filter((node) => {
        const statusOk = node.status === 200 || node.status === 0;
        const redirected = Boolean(node.redirectChain) && node.redirectChain.length !== 0;
        return statusOk && !redirected && !node.noindex;
    });
    const N = eligibleNodes.length;
    if (N === 0) {
        return;
    }

    // URL -> dense index, so scores can live in typed arrays.
    const urlToIndex = new Map(eligibleNodes.map((node, index) => [node.url, index]));

    // incoming[i] holds { sourceIndex, weight }; outgoing[i] holds { targetIndex, weight }.
    const incoming = Array.from({ length: N }, () => []);
    const outgoing = Array.from({ length: N }, () => []);
    for (const { source, target, weight } of graph.getEdges()) {
        if (source === target) {
            continue;
        }
        const sourceIndex = urlToIndex.get(source);
        const targetIndex = urlToIndex.get(target);
        if (sourceIndex === undefined || targetIndex === undefined) {
            continue;
        }
        incoming[targetIndex].push({ sourceIndex, weight });
        outgoing[sourceIndex].push({ targetIndex, weight });
    }

    const authScores = new Float64Array(N).fill(1.0);
    const hubScores = new Float64Array(N).fill(1.0);

    // Recomputes `into` as the weighted sum of `from` over each node's links,
    // then L2-normalises `into` (skipped when every entry is zero).
    const propagate = (into, from, links, indexKey) => {
        let sumOfSquares = 0;
        for (let i = 0; i < N; i++) {
            let total = 0;
            for (const link of links[i]) {
                total += from[link[indexKey]] * link.weight;
            }
            into[i] = total;
            sumOfSquares += total * total;
        }
        const norm = Math.sqrt(sumOfSquares);
        if (norm > 0) {
            for (let i = 0; i < N; i++) {
                into[i] /= norm;
            }
        }
    };

    for (let round = 0; round < iterations; round++) {
        // Authorities are refreshed first; hubs then read the fresh values.
        propagate(authScores, hubScores, incoming, 'sourceIndex');
        propagate(hubScores, authScores, outgoing, 'targetIndex');
    }

    eligibleNodes.forEach((node, i) => {
        node.authorityScore = authScores[i];
        node.hubScore = hubScores[i];
    });

    classifyLinkRoles(eligibleNodes);
}
91
/**
 * Assigns `linkRole` to each node from its authority and hub scores.
 *
 * A score counts as "high" when it exceeds 0.00001 and is either above the
 * median of all nodes' scores or equal to the maximum (the latter keeps
 * uniform distributions from having no high scorer at all). The median is the
 * element at index floor(length / 2) of the ascending scores.
 *
 * Roles: `'power'` (both high), `'authority'`, `'hub'`, `'balanced'` (neither
 * high but both above 0.00001), otherwise `'peripheral'`.
 *
 * @param {object[]} nodes - Nodes carrying `authorityScore` / `hubScore`;
 *   missing or falsy scores count as 0.
 * @returns {void}
 */
function classifyLinkRoles(nodes) {
    if (nodes.length === 0) {
        return;
    }
    const MIN_SCORE = 0.00001;
    const ascending = (x, y) => x - y;
    const authOf = (node) => node.authorityScore || 0;
    const hubOf = (node) => node.hubScore || 0;

    const sortedAuth = nodes.map(authOf).sort(ascending);
    const sortedHub = nodes.map(hubOf).sort(ascending);
    const middle = Math.floor(nodes.length / 2);
    const last = nodes.length - 1;

    // MIN_SCORE is positive, so passing it already rules out zero scores.
    const isHigh = (score, median, max) => score > MIN_SCORE && (score > median || score === max);

    for (const node of nodes) {
        const auth = authOf(node);
        const hub = hubOf(node);
        const highAuth = isHigh(auth, sortedAuth[middle], sortedAuth[last]);
        const highHub = isHigh(hub, sortedHub[middle], sortedHub[last]);

        let role = 'peripheral';
        if (highAuth && highHub) {
            role = 'power';
        }
        else if (highAuth) {
            role = 'authority';
        }
        else if (highHub) {
            role = 'hub';
        }
        else if (auth > MIN_SCORE && hub > MIN_SCORE) {
            role = 'balanced';
        }
        node.linkRole = role;
    }
}
@@ -1,37 +0,0 @@
1
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// Build step: copies the HTML templates that the compiler does not emit from
// src/ into the matching dist/ folders. All paths are resolved relative to
// this script's own location.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Creates `dir` (and any missing parents) and returns it.
 * With `recursive: true` an already-existing directory is not an error, so no
 * separate existence check is needed.
 */
function ensureDir(dir) {
    fs.mkdirSync(dir, { recursive: true });
    return dir;
}

/**
 * Copies `src` to `dest` when `src` exists; a missing source is skipped
 * silently, as before.
 */
function copyIfPresent(src, dest) {
    if (fs.existsSync(src)) {
        fs.copyFileSync(src, dest);
    }
}

// Ensure dist directories exist
const reportDestDir = ensureDir(path.join(__dirname, '../dist/report'));
const analysisDestDir = ensureDir(path.join(__dirname, '../dist/analysis'));

// Copy Report Assets
copyIfPresent(path.join(__dirname, '../src/report/crawl.html'), path.join(reportDestDir, 'crawl.html'));

// Copy Analysis Assets
for (const assetName of ['analysis_list.html', 'analysis_page.html']) {
    copyIfPresent(path.join(__dirname, '../src/analysis', assetName), path.join(analysisDestDir, assetName));
}
@@ -1,35 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="utf-8" />
5
- <title>Crawlith Analysis Report</title>
6
- <style>
7
- body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; color: #333; }
8
- h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
9
- table { width: 100%; border-collapse: collapse; margin-top: 20px; }
10
- th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
11
- th { background-color: #f4f4f4; }
12
- tr:nth-child(even) { background-color: #f9f9f9; }
13
- tr:hover { background-color: #f1f1f1; }
14
- </style>
15
- </head>
16
- <body>
17
- <h1>Analysis</h1>
18
- <p>Pages: {{PAGES_ANALYZED}}</p>
19
- <p>Average SEO: {{AVG_SEO_SCORE}}</p>
20
- <table border="1" cellspacing="0" cellpadding="6">
21
- <thead>
22
- <tr>
23
- <th>URL</th>
24
- <th>SEO Score</th>
25
- <th>Thin Score</th>
26
- <th>Title</th>
27
- <th>Meta</th>
28
- </tr>
29
- </thead>
30
- <tbody>
31
- {{ROWS}}
32
- </tbody>
33
- </table>
34
- </body>
35
- </html>
@@ -1,123 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Analysis for {{URL}}</title>
7
- <style>
8
- body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
9
- h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
10
- h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
11
- .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
12
- .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
13
- .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
14
- .status-ok { color: green; font-weight: bold; }
15
- .status-warning { color: orange; font-weight: bold; }
16
- .status-critical { color: red; font-weight: bold; }
17
- .status-missing { color: red; font-weight: bold; }
18
- .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
19
- .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
20
- .data-table th { width: 150px; color: #666; }
21
- code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
22
- </style>
23
- </head>
24
- <body>
25
- <h1>Page Analysis</h1>
26
- <p><strong>URL:</strong> <a href="{{URL}}" target="_blank">{{URL}}</a></p>
27
-
28
- <div class="score-card">
29
- <div class="score-box">
30
- <div class="score-val">{{SEO_SCORE}}</div>
31
- <div>SEO Score</div>
32
- </div>
33
- <div class="score-box">
34
- <div class="score-val">{{THIN_SCORE}}</div>
35
- <div>Thin Content Score</div>
36
- </div>
37
- <div class="score-box">
38
- <div class="score-val">{{HTTP_STATUS}}</div>
39
- <div>HTTP Status</div>
40
- </div>
41
- </div>
42
-
43
- <h2>Meta Tags</h2>
44
- <table class="data-table">
45
- <tr>
46
- <th>Title</th>
47
- <td>
48
- <div>{{TITLE_VALUE}}</div>
49
- <small>Length: {{TITLE_LENGTH}} | Status: <span class="status-{{TITLE_STATUS}}">{{TITLE_STATUS}}</span></small>
50
- </td>
51
- </tr>
52
- <tr>
53
- <th>Description</th>
54
- <td>
55
- <div>{{META_DESCRIPTION_VALUE}}</div>
56
- <small>Length: {{META_DESCRIPTION_LENGTH}} | Status: <span class="status-{{META_DESCRIPTION_STATUS}}">{{META_DESCRIPTION_STATUS}}</span></small>
57
- </td>
58
- </tr>
59
- <tr>
60
- <th>Canonical</th>
61
- <td>{{CANONICAL}}</td>
62
- </tr>
63
- <tr>
64
- <th>Robots</th>
65
- <td>
66
- Index: {{ROBOTS_INDEX}},
67
- Follow: {{ROBOTS_FOLLOW}}
68
- </td>
69
- </tr>
70
- </table>
71
-
72
- <h2>Content & Heading</h2>
73
- <table class="data-table">
74
- <tr>
75
- <th>H1 Tag</th>
76
- <td>
77
- Status: <span class="status-{{H1_STATUS}}">{{H1_STATUS}}</span>
78
- ({{H1_COUNT}} detected)
79
- {{H1_MATCHES_TITLE}}
80
- </td>
81
- </tr>
82
- <tr>
83
- <th>Word Count</th>
84
- <td>{{WORD_COUNT}} words</td>
85
- </tr>
86
- <tr>
87
- <th>Unique Sentences</th>
88
- <td>{{UNIQUE_SENTENCES}}</td>
89
- </tr>
90
- <tr>
91
- <th>Text / HTML Ratio</th>
92
- <td>{{TEXT_HTML_RATIO}}%</td>
93
- </tr>
94
- </table>
95
-
96
- <h2>Links & Images</h2>
97
- <table class="data-table">
98
- <tr>
99
- <th>Internal Links</th>
100
- <td>{{INTERNAL_LINKS}}</td>
101
- </tr>
102
- <tr>
103
- <th>External Links</th>
104
- <td>{{EXTERNAL_LINKS}} ({{EXTERNAL_RATIO}}%)</td>
105
- </tr>
106
- <tr>
107
- <th>Images</th>
108
- <td>{{TOTAL_IMAGES}} total ({{MISSING_ALT}} missing alt text)</td>
109
- </tr>
110
- </table>
111
-
112
- <h2>Structured Data</h2>
113
- <table class="data-table">
114
- <tr>
115
- <th>Status</th>
116
- <td>
117
- {{STRUCTURED_DATA_STATUS}}
118
- </td>
119
- </tr>
120
- {{STRUCTURED_DATA_TYPES_ROW}}
121
- </table>
122
- </body>
123
- </html>