@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,102 +1,137 @@
1
+ import { DEFAULTS } from '../constants.js';
1
2
  /**
2
- * Production-Grade Weighted PageRank Engine
3
+ * Service to analyze a site's link graph and compute PageRank metrics.
4
+ * Runs only on the full crawl graph.
3
5
  */
4
- export function computePageRank(graph, options = {}) {
5
- const d = options.dampingFactor ?? 0.85;
6
- const maxIterations = options.maxIterations ?? 40;
7
- const epsilon = options.convergenceThreshold ?? 1e-5;
8
- const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
9
- const allNodes = graph.getNodes();
10
- const allEdges = graph.getEdges();
11
- // 1. Filter Eligible Nodes
12
- const eligibleNodes = allNodes.filter(node => {
13
- if (node.noindex)
14
- return false;
15
- if (node.isCollapsed)
16
- return false;
17
- if (node.soft404Score && node.soft404Score > soft404Threshold)
18
- return false;
19
- if (node.canonical && node.canonical !== node.url)
20
- return false;
21
- if (node.status >= 400)
22
- return false; // Don't pass rank to broken pages
23
- return true;
24
- });
25
- const nodeCount = eligibleNodes.length;
26
- if (nodeCount === 0)
27
- return;
28
- const nodeUrls = eligibleNodes.map(n => n.url);
29
- const nodeMap = new Map();
30
- eligibleNodes.forEach(n => nodeMap.set(n.url, n));
31
- // Initialize PageRank
32
- let pr = new Map();
33
- nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));
34
- // Pre-calculate weighted outbound sums and inverted adjacency
35
- const outWeights = new Map();
36
- const incoming = new Map();
37
- const sinks = [];
38
- // Initialize outWeights for all eligible nodes
39
- nodeUrls.forEach(url => outWeights.set(url, 0));
40
- for (const edge of allEdges) {
41
- if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
42
- const weight = edge.weight || 1.0;
43
- const sources = incoming.get(edge.target) ?? [];
44
- sources.push({ source: edge.source, weight });
45
- incoming.set(edge.target, sources);
46
- outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
47
- }
48
- }
49
- // Identify sinks
50
- nodeUrls.forEach(url => {
51
- if ((outWeights.get(url) || 0) === 0) {
52
- sinks.push(url);
6
+ export class PageRankService {
7
+ /**
8
+ * Computes a Production-Grade Weighted PageRank over the given graph.
9
+ * @param {Graph} graph - The full site graph structure.
10
+ * @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
11
+ * @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
12
+ */
13
+ evaluate(graph, options = {}) {
14
+ const d = options.dampingFactor ?? 0.85;
15
+ const maxIterations = options.maxIterations ?? 40;
16
+ const epsilon = options.convergenceThreshold ?? 1e-5;
17
+ const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
18
+ const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50;
19
+ const allNodes = graph.getNodes();
20
+ const allEdges = graph.getEdges();
21
+ // 1. Filter Eligible Nodes
22
+ const eligibleNodes = allNodes.filter(node => {
23
+ if (node.noindex)
24
+ return false;
25
+ if (node.isCollapsed)
26
+ return false;
27
+ // Keep compat with other plugins mutating soft404Score onto nodes
28
+ if (node.soft404Score && node.soft404Score > soft404Threshold)
29
+ return false;
30
+ // canonical is stored as absolute URL; extract pathname for path-based comparison
31
+ if (node.canonical) {
32
+ try {
33
+ const canonicalPath = new URL(node.canonical).pathname;
34
+ if (canonicalPath !== node.url)
35
+ return false;
36
+ }
37
+ catch {
38
+ // if canonical isn't a valid URL, compare as-is
39
+ if (node.canonical !== node.url)
40
+ return false;
41
+ }
42
+ }
43
+ if (node.status >= 400)
44
+ return false; // Don't pass rank to broken pages
45
+ if (node.status === 0)
46
+ return false; // Don't pass rank to uncrawled/external pages
47
+ return true;
48
+ });
49
+ const nodeCount = eligibleNodes.length;
50
+ const results = new Map();
51
+ if (nodeCount === 0)
52
+ return results;
53
+ // Map URL to Index for O(1) access and TypedArray usage
54
+ const urlToIndex = new Map();
55
+ for (let i = 0; i < nodeCount; i++) {
56
+ urlToIndex.set(eligibleNodes[i].url, i);
53
57
  }
54
- });
55
- // Iterative Calculation
56
- for (let i = 0; i < maxIterations; i++) {
57
- const nextPr = new Map();
58
- // Calculate total rank from sinks to redistribute
59
- let sinkRankTotal = 0;
60
- for (const url of sinks) {
61
- sinkRankTotal += pr.get(url) || 0;
58
+ // Pre-calculate weighted outbound sums and inverted adjacency
59
+ const outWeights = new Float64Array(nodeCount);
60
+ const incoming = new Array(nodeCount).fill(null).map(() => []);
61
+ for (const edge of allEdges) {
62
+ const sourceIndex = urlToIndex.get(edge.source);
63
+ const targetIndex = urlToIndex.get(edge.target);
64
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
65
+ const weight = edge.weight || 1.0;
66
+ incoming[targetIndex].push({ sourceIndex, weight });
67
+ outWeights[sourceIndex] += weight;
68
+ }
62
69
  }
63
- const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
64
- for (const url of nodeUrls) {
65
- let rankFromLinks = 0;
66
- const sources = incoming.get(url) || [];
67
- for (const edge of sources) {
68
- const sourceRank = pr.get(edge.source) || 0;
69
- const sourceOutWeight = outWeights.get(edge.source) || 1.0;
70
- rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
70
+ // Identify sinks
71
+ const sinks = [];
72
+ for (let i = 0; i < nodeCount; i++) {
73
+ if (outWeights[i] === 0) {
74
+ sinks.push(i);
71
75
  }
72
- const newRank = baseRank + d * rankFromLinks;
73
- nextPr.set(url, newRank);
74
76
  }
75
- // Convergence check
76
- let maxDelta = 0;
77
- for (const url of nodeUrls) {
78
- const delta = Math.abs(nextPr.get(url) - pr.get(url));
79
- if (delta > maxDelta)
80
- maxDelta = delta;
77
+ // Initialize PageRank typed arrays
78
+ let pr = new Float64Array(nodeCount).fill(1 / nodeCount);
79
+ let nextPr = new Float64Array(nodeCount);
80
+ // Iterative Calculation
81
+ for (let iter = 0; iter < maxIterations; iter++) {
82
+ // Calculate total rank from sinks to redistribute
83
+ let sinkRankTotal = 0;
84
+ for (let i = 0; i < sinks.length; i++) {
85
+ sinkRankTotal += pr[sinks[i]];
86
+ }
87
+ const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
88
+ let maxDelta = 0;
89
+ for (let i = 0; i < nodeCount; i++) {
90
+ let rankFromLinks = 0;
91
+ const sources = incoming[i];
92
+ for (let j = 0; j < sources.length; j++) {
93
+ const edge = sources[j];
94
+ const sourceRank = pr[edge.sourceIndex];
95
+ const sourceOutWeight = outWeights[edge.sourceIndex] || 1.0;
96
+ rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
97
+ }
98
+ const newRank = baseRank + d * rankFromLinks;
99
+ nextPr[i] = newRank;
100
+ const delta = Math.abs(newRank - pr[i]);
101
+ if (delta > maxDelta) {
102
+ maxDelta = delta;
103
+ }
104
+ }
105
+ // Swap arrays
106
+ const temp = pr;
107
+ pr = nextPr;
108
+ nextPr = temp;
109
+ if (maxDelta < epsilon)
110
+ break;
81
111
  }
82
- pr = nextPr;
83
- if (maxDelta < epsilon)
84
- break;
85
- }
86
- // 2. Normalization (0-100)
87
- const ranks = Array.from(pr.values());
88
- const minPR = Math.min(...ranks);
89
- const maxPR = Math.max(...ranks);
90
- const range = maxPR - minPR;
91
- for (const node of eligibleNodes) {
92
- const rawRank = pr.get(node.url);
93
- node.pageRank = rawRank;
94
- if (range > 1e-12) {
95
- node.pageRankScore = 100 * (rawRank - minPR) / range;
112
+ // 2. Normalization (0-100)
113
+ let minPR = pr[0];
114
+ let maxPR = pr[0];
115
+ for (let i = 1; i < nodeCount; i++) {
116
+ const rank = pr[i];
117
+ if (rank < minPR)
118
+ minPR = rank;
119
+ if (rank > maxPR)
120
+ maxPR = rank;
96
121
  }
97
- else {
98
- // If there's no range, all eligible pages are equally important.
99
- node.pageRankScore = 100;
122
+ const range = maxPR - minPR;
123
+ for (let i = 0; i < nodeCount; i++) {
124
+ const rawRank = pr[i];
125
+ const url = eligibleNodes[i].url;
126
+ let score = neutralScoreWhenFlat;
127
+ if (range > DEFAULTS.GRAPH_PRECISION) {
128
+ score = 100 * (rawRank - minPR) / range;
129
+ }
130
+ results.set(url, {
131
+ raw_rank: rawRank,
132
+ score: Number(score.toFixed(3))
133
+ });
100
134
  }
135
+ return results;
101
136
  }
102
137
  }
@@ -2,6 +2,8 @@ export declare class SimHash {
2
2
  private static FNV_PRIME;
3
3
  private static FNV_OFFSET_BASIS;
4
4
  private static MAX_UINT64;
5
+ static readonly BANDS = 4;
6
+ static readonly BAND_WIDTH = 16;
5
7
  /**
6
8
  * Generates a 64-bit FNV-1a hash for a given string token.
7
9
  */
@@ -10,6 +12,10 @@ export declare class SimHash {
10
12
  * Generates a 64-bit SimHash from an array of tokens.
11
13
  */
12
14
  static generate(tokens: string[]): bigint;
15
+ /**
16
+ * Splits a 64-bit SimHash into 4 bands of 16 bits.
17
+ */
18
+ static getBands(simhash: bigint): number[];
13
19
  /**
14
20
  * Computes the Hamming distance between two 64-bit hashes.
15
21
  */
@@ -2,6 +2,8 @@ export class SimHash {
2
2
  static FNV_PRIME = 1099511628211n;
3
3
  static FNV_OFFSET_BASIS = 14695981039346656037n;
4
4
  static MAX_UINT64 = 0xffffffffffffffffn;
5
+ static BANDS = 4;
6
+ static BAND_WIDTH = 16;
5
7
  /**
6
8
  * Generates a 64-bit FNV-1a hash for a given string token.
7
9
  */
@@ -40,6 +42,18 @@ export class SimHash {
40
42
  }
41
43
  return simhash;
42
44
  }
45
+ /**
46
+ * Splits a 64-bit SimHash into 4 bands of 16 bits.
47
+ */
48
+ static getBands(simhash) {
49
+ const bands = [];
50
+ for (let i = 0; i < SimHash.BANDS; i++) {
51
+ // Extract 16-bit chunks
52
+ const chunk = Number((simhash >> BigInt(i * SimHash.BAND_WIDTH)) & 0xffffn);
53
+ bands.push(chunk);
54
+ }
55
+ return bands;
56
+ }
43
57
  /**
44
58
  * Computes the Hamming distance between two 64-bit hashes.
45
59
  */
package/dist/index.d.ts CHANGED
@@ -1,24 +1,35 @@
1
+ export * from './scoring/health.js';
1
2
  export * from './crawler/crawl.js';
3
+ export * from './crawler/normalize.js';
2
4
  export * from './crawler/metricsRunner.js';
5
+ export * from './crawler/trap.js';
3
6
  export * from './graph/metrics.js';
4
- export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
7
7
  export * from './graph/graph.js';
8
- export * from './diff/compare.js';
9
- export * from './scoring/orphanSeverity.js';
8
+ export * from './graph/simhash.js';
10
9
  export * from './graph/pagerank.js';
11
- export * from './graph/duplicate.js';
12
- export * from './graph/cluster.js';
13
- export * from './scoring/hits.js';
10
+ export * from './graph/hits.js';
11
+ export * from './diff/compare.js';
12
+ export * from './diff/service.js';
14
13
  export * from './analysis/analyze.js';
15
14
  export * from './analysis/content.js';
16
15
  export * from './analysis/seo.js';
17
16
  export * from './analysis/images.js';
18
17
  export * from './analysis/links.js';
18
+ export * from './analysis/scoring.js';
19
+ export * from './analysis/clustering.js';
20
+ export * from './analysis/duplicate.js';
21
+ export * from './analysis/soft404.js';
22
+ export * from './analysis/heading.js';
23
+ export * from './analysis/orphan.js';
19
24
  export * from './audit/index.js';
20
25
  export * from './audit/types.js';
26
+ export * from './report/html.js';
27
+ export * from './report/crawl_template.js';
28
+ export * from './report/crawlExport.js';
29
+ export * from './report/export.js';
30
+ export * from './report/insight.js';
21
31
  export * from './db/index.js';
32
+ export * from './db/reset.js';
22
33
  export * from './db/graphLoader.js';
23
34
  export * from './db/repositories/SiteRepository.js';
24
35
  export * from './db/repositories/SnapshotRepository.js';
@@ -28,3 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
28
39
  export * from './lock/lockManager.js';
29
40
  export * from './lock/hashKey.js';
30
41
  export * from './utils/version.js';
42
+ export * from './utils/secureConfig.js';
43
+ export * from './events.js';
44
+ export * from './plugin-system/plugin-types.js';
45
+ export * from './plugin-system/plugin-loader.js';
46
+ export * from './plugin-system/plugin-registry.js';
47
+ export * from './plugin-system/plugin-cli.js';
48
+ export * from './ports/index.js';
49
+ export * from './application/usecase.js';
50
+ export * from './application/usecases.js';
51
+ export { Command } from 'commander';
package/dist/index.js CHANGED
@@ -1,24 +1,35 @@
1
+ export * from './scoring/health.js';
1
2
  export * from './crawler/crawl.js';
3
+ export * from './crawler/normalize.js';
2
4
  export * from './crawler/metricsRunner.js';
5
+ export * from './crawler/trap.js';
3
6
  export * from './graph/metrics.js';
4
- export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
7
7
  export * from './graph/graph.js';
8
- export * from './diff/compare.js';
9
- export * from './scoring/orphanSeverity.js';
8
+ export * from './graph/simhash.js';
10
9
  export * from './graph/pagerank.js';
11
- export * from './graph/duplicate.js';
12
- export * from './graph/cluster.js';
13
- export * from './scoring/hits.js';
10
+ export * from './graph/hits.js';
11
+ export * from './diff/compare.js';
12
+ export * from './diff/service.js';
14
13
  export * from './analysis/analyze.js';
15
14
  export * from './analysis/content.js';
16
15
  export * from './analysis/seo.js';
17
16
  export * from './analysis/images.js';
18
17
  export * from './analysis/links.js';
18
+ export * from './analysis/scoring.js';
19
+ export * from './analysis/clustering.js';
20
+ export * from './analysis/duplicate.js';
21
+ export * from './analysis/soft404.js';
22
+ export * from './analysis/heading.js';
23
+ export * from './analysis/orphan.js';
19
24
  export * from './audit/index.js';
20
25
  export * from './audit/types.js';
26
+ export * from './report/html.js';
27
+ export * from './report/crawl_template.js';
28
+ export * from './report/crawlExport.js';
29
+ export * from './report/export.js';
30
+ export * from './report/insight.js';
21
31
  export * from './db/index.js';
32
+ export * from './db/reset.js';
22
33
  export * from './db/graphLoader.js';
23
34
  export * from './db/repositories/SiteRepository.js';
24
35
  export * from './db/repositories/SnapshotRepository.js';
@@ -28,3 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
28
39
  export * from './lock/lockManager.js';
29
40
  export * from './lock/hashKey.js';
30
41
  export * from './utils/version.js';
42
+ export * from './utils/secureConfig.js';
43
+ export * from './events.js';
44
+ export * from './plugin-system/plugin-types.js';
45
+ export * from './plugin-system/plugin-loader.js';
46
+ export * from './plugin-system/plugin-registry.js';
47
+ export * from './plugin-system/plugin-cli.js';
48
+ export * from './ports/index.js';
49
+ export * from './application/usecase.js';
50
+ export * from './application/usecases.js';
51
+ export { Command } from 'commander';
@@ -20,7 +20,7 @@ const RELEVANT_FLAGS = [
20
20
  'concurrency'
21
21
  ];
22
22
  export function generateLockKey(commandName, targetUrl, options) {
23
- // Respect the query stripping option consistent with sitegraph logic
23
+ // Respect the query stripping option consistent with crawl logic
24
24
  const stripQuery = !options.query;
25
25
  const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
26
26
  // Extract relevant options in a deterministic order
@@ -1,7 +1,11 @@
1
+ import { EngineContext } from '../events.js';
1
2
  export declare class LockManager {
2
3
  private static lockFilePath;
4
+ private static context;
3
5
  private static get lockDir();
4
- static acquireLock(commandName: string, targetUrl: string, options: any, force?: boolean): Promise<void>;
6
+ static acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force?: boolean): Promise<void>;
5
7
  static releaseLock(): void;
8
+ static clearAllLocks(): Promise<number>;
9
+ private static log;
6
10
  private static registerHandlers;
7
11
  }
@@ -2,18 +2,18 @@ import fs from 'node:fs/promises';
2
2
  import { existsSync, unlinkSync, readFileSync } from 'node:fs';
3
3
  import path from 'node:path';
4
4
  import os from 'node:os';
5
- import chalk from 'chalk';
6
5
  import { generateLockKey } from './hashKey.js';
7
6
  import { isPidAlive } from './pidCheck.js';
8
7
  export class LockManager {
9
8
  static lockFilePath = null;
9
+ static context = null;
10
10
  static get lockDir() {
11
11
  return path.join(os.homedir(), '.crawlith', 'locks');
12
12
  }
13
- static async acquireLock(commandName, targetUrl, options, force = false) {
13
+ static async acquireLock(commandName, targetUrl, options, context, force = false) {
14
+ this.context = context || null;
14
15
  const lockHash = generateLockKey(commandName, targetUrl, options);
15
16
  // Ensure lock directory exists
16
- // We can use sync or async here. Since this is one-time setup, async is fine.
17
17
  await fs.mkdir(this.lockDir, { recursive: true });
18
18
  const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
19
19
  // Check existing lock
@@ -29,10 +29,10 @@ export class LockManager {
29
29
  catch (_e) {
30
30
  // Corrupted -> Treat as stale
31
31
  isStale = true;
32
- pid = 0; // Fallback, though unused if isStale is true
32
+ pid = 0;
33
33
  }
34
34
  if (force) {
35
- console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
35
+ this.log('warn', 'Force mode enabled. Overriding existing lock.');
36
36
  try {
37
37
  unlinkSync(lockPath);
38
38
  }
@@ -40,11 +40,11 @@ export class LockManager {
40
40
  }
41
41
  else {
42
42
  if (!isStale) {
43
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
43
+ this.log('error', `Crawlith: command already running for ${targetUrl} (PID ${pid})`);
44
44
  process.exit(1);
45
45
  }
46
46
  else {
47
- console.log(chalk.gray('Detected stale lock. Continuing execution.'));
47
+ this.log('info', 'Detected stale lock. Continuing execution.');
48
48
  try {
49
49
  unlinkSync(lockPath);
50
50
  }
@@ -68,8 +68,7 @@ export class LockManager {
68
68
  }
69
69
  catch (error) {
70
70
  if (error.code === 'EEXIST') {
71
- // Race condition: another process created lock between our check and open
72
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
71
+ this.log('error', `Crawlith: command already running for ${targetUrl} (Race condition)`);
73
72
  process.exit(1);
74
73
  }
75
74
  throw error;
@@ -86,15 +85,41 @@ export class LockManager {
86
85
  }
87
86
  }
88
87
  }
88
+ static async clearAllLocks() {
89
+ if (!existsSync(this.lockDir))
90
+ return 0;
91
+ const files = await fs.readdir(this.lockDir);
92
+ const lockFiles = files.filter(f => f.endsWith('.lock'));
93
+ let count = 0;
94
+ for (const file of lockFiles) {
95
+ try {
96
+ await fs.unlink(path.join(this.lockDir, file));
97
+ count++;
98
+ }
99
+ catch { /* ignore */ }
100
+ }
101
+ return count;
102
+ }
103
+ static log(type, message, error) {
104
+ if (this.context) {
105
+ this.context.emit({ type, message, error });
106
+ }
107
+ else {
108
+ // Fallback for legacy usage or when no context provided
109
+ if (type === 'error')
110
+ console.error(message, error || '');
111
+ else if (type === 'warn')
112
+ console.warn(message);
113
+ else
114
+ console.log(message);
115
+ }
116
+ }
89
117
  static registerHandlers() {
90
118
  // Ensure cleanup only happens once
91
119
  const cleanup = () => {
92
120
  this.releaseLock();
93
121
  };
94
- // process.on('exit') is only called when process.exit() is called or event loop empties.
95
- // It requires synchronous cleanup.
96
122
  process.on('exit', cleanup);
97
- // Signals
98
123
  process.on('SIGINT', () => {
99
124
  cleanup();
100
125
  process.exit(130);
@@ -104,7 +129,7 @@ export class LockManager {
104
129
  process.exit(143);
105
130
  });
106
131
  process.on('uncaughtException', (err) => {
107
- console.error(chalk.red('Uncaught Exception:'), err);
132
+ this.log('error', 'Uncaught Exception', err);
108
133
  cleanup();
109
134
  process.exit(1);
110
135
  });
@@ -0,0 +1,10 @@
1
+ import { Command } from 'commander';
2
+ /**
3
+ * Standard utility for plugins to register their configuration commands.
4
+ * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
5
+ *
6
+ * @param cli - The main Commander instance (must have name 'crawlith').
7
+ * @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
8
+ * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
9
+ */
10
+ export declare function registerPluginConfigCommand(cli: Command, pluginName: string, credentialLabel: string): void;
@@ -0,0 +1,31 @@
1
+ import { Command } from 'commander';
2
+ import { setEncryptedConfigKey } from '../utils/secureConfig.js';
3
+ /**
4
+ * Standard utility for plugins to register their configuration commands.
5
+ * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
6
+ *
7
+ * @param cli - The main Commander instance (must have name 'crawlith').
8
+ * @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
9
+ * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
10
+ */
11
+ export function registerPluginConfigCommand(cli, pluginName, credentialLabel) {
12
+ // Only register subcommands if we are in the root 'crawlith' CLI context
13
+ if (cli.name() !== 'crawlith')
14
+ return;
15
+ // Find or create 'config' command
16
+ let configCmd = cli.commands.find(c => c.name() === 'config');
17
+ if (!configCmd) {
18
+ configCmd = new Command('config').description('Manage Crawlith plugin configuration');
19
+ cli.addCommand(configCmd);
20
+ }
21
+ // Define plugin-specific subcommand
22
+ const pluginConfigCmd = new Command(pluginName).description(`Manage ${pluginName} configuration`);
23
+ pluginConfigCmd
24
+ .command('set <value>')
25
+ .description(`Set and encrypt ${credentialLabel}`)
26
+ .action((value) => {
27
+ setEncryptedConfigKey(pluginName, value);
28
+ console.log(`✅ ${credentialLabel} for ${pluginName} saved and encrypted.`);
29
+ });
30
+ configCmd.addCommand(pluginConfigCmd);
31
+ }
@@ -0,0 +1,16 @@
1
+ export declare class PluginConfig {
2
+ private pluginName;
3
+ constructor(pluginName: string);
4
+ /**
5
+ * Get a decrypted config key for the current plugin.
6
+ */
7
+ get(keyName?: string): string;
8
+ /**
9
+ * Get a decrypted config key, or throw a user-friendly error if it's missing.
10
+ */
11
+ require(keyName?: string): string;
12
+ /**
13
+ * Set/Encrypt a config key for the current plugin.
14
+ */
15
+ set(value: string): void;
16
+ }
@@ -0,0 +1,36 @@
1
+ import { getDecryptedConfigKey, setEncryptedConfigKey } from '../utils/secureConfig.js';
2
+ export class PluginConfig {
3
+ pluginName;
4
+ constructor(pluginName) {
5
+ this.pluginName = pluginName;
6
+ }
7
+ /**
8
+ * Get a decrypted config key for the current plugin.
9
+ */
10
+ get(keyName) {
11
+ const section = keyName || this.pluginName;
12
+ // Safety check: ensure plugins can only access their own config section
13
+ if (section !== this.pluginName) {
14
+ throw new Error(`Security Violation: Plugin "${this.pluginName}" attempted to access config for "${section}"`);
15
+ }
16
+ return getDecryptedConfigKey(section);
17
+ }
18
+ /**
19
+ * Get a decrypted config key, or throw a user-friendly error if it's missing.
20
+ */
21
+ require(keyName) {
22
+ try {
23
+ return this.get(keyName);
24
+ }
25
+ catch (_error) {
26
+ const section = keyName || this.pluginName;
27
+ throw new Error(`Missing ${section} configuration. Please run: crawlith config ${section} set <value>`, { cause: _error });
28
+ }
29
+ }
30
+ /**
31
+ * Set/Encrypt a config key for the current plugin.
32
+ */
33
+ set(value) {
34
+ setEncryptedConfigKey(this.pluginName, value);
35
+ }
36
+ }
@@ -0,0 +1,17 @@
1
+ import { CrawlithPlugin } from './plugin-types.js';
2
+ export interface PluginLoaderLogger {
3
+ debug(msg: string): void;
4
+ info?(msg: string): void;
5
+ warn?(msg: string): void;
6
+ error?(msg: string): void;
7
+ }
8
+ export declare class PluginLoader {
9
+ private plugins;
10
+ private logger?;
11
+ constructor(logger?: PluginLoaderLogger);
12
+ discover(rootPath: string): Promise<CrawlithPlugin[]>;
13
+ private loadFromDir;
14
+ private loadFromNodeModules;
15
+ private tryLoadPlugin;
16
+ private validatePlugin;
17
+ }