@crawlith/core 0.1.1 → 0.1.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Service to compute Hub and Authority scores using the HITS algorithm.
3
+ * Operates purely on the internal link graph.
4
+ */
5
+ export class HITSService {
6
+ /**
7
+ * Computes Hub and Authority scores using the HITS algorithm.
8
+ * @param {Graph} graph - The link graph to analyze.
9
+ * @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
10
+ * @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
11
+ */
12
+ evaluate(graph, options = {}) {
13
+ const iterations = options.iterations || 20;
14
+ const nodes = graph.getNodes();
15
+ // 1. Filter eligible nodes
16
+ const eligibleNodes = nodes.filter(n => (n.status === 200 || n.status === 0) &&
17
+ (!n.redirectChain || n.redirectChain.length === 0) &&
18
+ !n.noindex);
19
+ const N = eligibleNodes.length;
20
+ const results = new Map();
21
+ if (N === 0)
22
+ return results;
23
+ // Map URL to Index for O(1) access
24
+ const urlToIndex = new Map();
25
+ for (let i = 0; i < N; i++) {
26
+ urlToIndex.set(eligibleNodes[i].url, i);
27
+ }
28
+ // Build Adjacency Lists
29
+ const incoming = new Array(N).fill(null).map(() => []);
30
+ const outgoing = new Array(N).fill(null).map(() => []);
31
+ const allEdges = graph.getEdges();
32
+ for (const edge of allEdges) {
33
+ if (edge.source === edge.target)
34
+ continue;
35
+ const sourceIndex = urlToIndex.get(edge.source);
36
+ const targetIndex = urlToIndex.get(edge.target);
37
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
38
+ const weight = edge.weight || 1.0;
39
+ incoming[targetIndex].push({ sourceIndex, weight });
40
+ outgoing[sourceIndex].push({ targetIndex, weight });
41
+ }
42
+ }
43
+ // Initialize Scores
44
+ const authScores = new Float64Array(N).fill(1.0);
45
+ const hubScores = new Float64Array(N).fill(1.0);
46
+ // 2. Iteration
47
+ for (let iter = 0; iter < iterations; iter++) {
48
+ let normAuth = 0;
49
+ for (let i = 0; i < N; i++) {
50
+ const inLinks = incoming[i];
51
+ let newAuth = 0;
52
+ for (let j = 0; j < inLinks.length; j++) {
53
+ const link = inLinks[j];
54
+ newAuth += hubScores[link.sourceIndex] * link.weight;
55
+ }
56
+ authScores[i] = newAuth;
57
+ normAuth += newAuth * newAuth;
58
+ }
59
+ normAuth = Math.sqrt(normAuth);
60
+ if (normAuth > 0) {
61
+ for (let i = 0; i < N; i++)
62
+ authScores[i] /= normAuth;
63
+ }
64
+ let normHub = 0;
65
+ for (let i = 0; i < N; i++) {
66
+ const outLinks = outgoing[i];
67
+ let newHub = 0;
68
+ for (let j = 0; j < outLinks.length; j++) {
69
+ const link = outLinks[j];
70
+ newHub += authScores[link.targetIndex] * link.weight;
71
+ }
72
+ hubScores[i] = newHub;
73
+ normHub += newHub * newHub;
74
+ }
75
+ normHub = Math.sqrt(normHub);
76
+ if (normHub > 0) {
77
+ for (let i = 0; i < N; i++)
78
+ hubScores[i] /= normHub;
79
+ }
80
+ }
81
+ // 3. Classification and Result Mapping
82
+ const sortedAuth = [...authScores].sort((a, b) => a - b);
83
+ const sortedHub = [...hubScores].sort((a, b) => a - b);
84
+ const medianAuth = sortedAuth[Math.floor(sortedAuth.length / 2)];
85
+ const medianHub = sortedHub[Math.floor(sortedHub.length / 2)];
86
+ const maxAuth = sortedAuth[sortedAuth.length - 1];
87
+ const maxHub = sortedHub[sortedHub.length - 1];
88
+ for (let i = 0; i < N; i++) {
89
+ const auth = authScores[i];
90
+ const hub = hubScores[i];
91
+ const url = eligibleNodes[i].url;
92
+ const isHighAuth = (auth > medianAuth || (auth === maxAuth && auth > 0)) && auth > 0.00001;
93
+ const isHighHub = (hub > medianHub || (hub === maxHub && hub > 0)) && hub > 0.00001;
94
+ let link_role = 'peripheral';
95
+ if (isHighAuth && isHighHub)
96
+ link_role = 'power';
97
+ else if (isHighAuth)
98
+ link_role = 'authority';
99
+ else if (isHighHub)
100
+ link_role = 'hub';
101
+ else if (auth > 0.00001 && hub > 0.00001)
102
+ link_role = 'balanced';
103
+ results.set(url, {
104
+ authority_score: auth,
105
+ hub_score: hub,
106
+ link_role
107
+ });
108
+ }
109
+ return results;
110
+ }
111
+ }
@@ -14,10 +14,6 @@ export interface Metrics {
14
14
  crawlEfficiencyScore: number;
15
15
  averageDepth: number;
16
16
  structuralEntropy: number;
17
- topPageRankPages: {
18
- url: string;
19
- score: number;
20
- }[];
21
17
  limitReached: boolean;
22
18
  sessionStats?: {
23
19
  pagesFetched: number;
@@ -5,14 +5,24 @@ export function calculateMetrics(graph, _maxDepth) {
5
5
  const totalEdges = edges.length;
6
6
  // Identify broken nodes
7
7
  const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
8
+ // Pre-compute outgoing edges per node for faster lookup
9
+ const outgoingEdges = new Map();
10
+ for (const edge of edges) {
11
+ let targets = outgoingEdges.get(edge.source);
12
+ if (!targets) {
13
+ targets = [];
14
+ outgoingEdges.set(edge.source, targets);
15
+ }
16
+ targets.push(edge.target);
17
+ }
8
18
  // Populate brokenLinks per node
9
19
  for (const node of nodes) {
10
- const nodeEdges = edges.filter(e => e.source === node.url);
11
- const broken = nodeEdges
12
- .map(e => e.target)
13
- .filter(targetUrl => brokenNodes.has(targetUrl));
14
- if (broken.length > 0) {
15
- node.brokenLinks = broken;
20
+ const targets = outgoingEdges.get(node.url);
21
+ if (targets) {
22
+ const broken = targets.filter(targetUrl => brokenNodes.has(targetUrl));
23
+ if (broken.length > 0) {
24
+ node.brokenLinks = broken;
25
+ }
16
26
  }
17
27
  }
18
28
  // Authority Score (per node)
@@ -55,16 +65,11 @@ export function calculateMetrics(graph, _maxDepth) {
55
65
  }
56
66
  }
57
67
  // topAuthorityPages: Top 10 by authority
58
- const topAuthorityPages = [...nodes]
59
- .map(n => ({ url: n.url, authority: n.authorityScore ?? getAuthority(n) }))
68
+ const topAuthorityPages = nodes
69
+ .filter(n => n.isInternal !== false && n.status > 0)
70
+ .map(n => ({ url: n.url, authority: getAuthority(n) }))
60
71
  .sort((a, b) => b.authority - a.authority)
61
72
  .slice(0, 10);
62
- // topPageRankPages: Top 10 by raw PageRank
63
- const topPageRankPages = [...nodes]
64
- .filter(n => n.pageRank !== undefined)
65
- .map(n => ({ url: n.url, score: n.pageRank }))
66
- .sort((a, b) => b.score - a.score)
67
- .slice(0, 10);
68
73
  const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
69
74
  const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);
70
75
  return {
@@ -79,7 +84,6 @@ export function calculateMetrics(graph, _maxDepth) {
79
84
  crawlEfficiencyScore,
80
85
  averageDepth,
81
86
  structuralEntropy,
82
- topPageRankPages,
83
87
  limitReached: graph.limitReached,
84
88
  sessionStats: graph.sessionStats
85
89
  };
@@ -1,12 +1,25 @@
1
1
  import { Graph } from './graph.js';
2
- interface PageRankOptions {
2
+ export interface PageRankRow {
3
+ raw_rank: number;
4
+ score: number;
5
+ }
6
+ export interface PageRankOptions {
3
7
  dampingFactor?: number;
4
8
  maxIterations?: number;
5
9
  convergenceThreshold?: number;
6
10
  soft404WeightThreshold?: number;
11
+ neutralScoreWhenFlat?: number;
7
12
  }
8
13
  /**
9
- * Production-Grade Weighted PageRank Engine
14
+ * Service to analyze a site's link graph and compute PageRank metrics.
15
+ * Runs only on the full crawl graph.
10
16
  */
11
- export declare function computePageRank(graph: Graph, options?: PageRankOptions): void;
12
- export {};
17
+ export declare class PageRankService {
18
+ /**
19
+ * Computes a Production-Grade Weighted PageRank over the given graph.
20
+ * @param {Graph} graph - The full site graph structure.
21
+ * @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
22
+ * @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
23
+ */
24
+ evaluate(graph: Graph, options?: PageRankOptions): Map<string, PageRankRow>;
25
+ }
@@ -1,104 +1,137 @@
1
+ import { DEFAULTS } from '../constants.js';
1
2
  /**
2
- * Production-Grade Weighted PageRank Engine
3
+ * Service to analyze a site's link graph and compute PageRank metrics.
4
+ * Runs only on the full crawl graph.
3
5
  */
4
- export function computePageRank(graph, options = {}) {
5
- const d = options.dampingFactor ?? 0.85;
6
- const maxIterations = options.maxIterations ?? 40;
7
- const epsilon = options.convergenceThreshold ?? 1e-5;
8
- const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
9
- const allNodes = graph.getNodes();
10
- const allEdges = graph.getEdges();
11
- // 1. Filter Eligible Nodes
12
- const eligibleNodes = allNodes.filter(node => {
13
- if (node.noindex)
14
- return false;
15
- if (node.isCollapsed)
16
- return false;
17
- if (node.soft404Score && node.soft404Score > soft404Threshold)
18
- return false;
19
- if (node.canonical && node.canonical !== node.url)
20
- return false;
21
- if (node.status >= 400)
22
- return false; // Don't pass rank to broken pages
23
- if (node.status === 0)
24
- return false; // Don't pass rank to uncrawled/external pages
25
- return true;
26
- });
27
- const nodeCount = eligibleNodes.length;
28
- if (nodeCount === 0)
29
- return;
30
- const nodeUrls = eligibleNodes.map(n => n.url);
31
- const nodeMap = new Map();
32
- eligibleNodes.forEach(n => nodeMap.set(n.url, n));
33
- // Initialize PageRank
34
- let pr = new Map();
35
- nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));
36
- // Pre-calculate weighted outbound sums and inverted adjacency
37
- const outWeights = new Map();
38
- const incoming = new Map();
39
- const sinks = [];
40
- // Initialize outWeights for all eligible nodes
41
- nodeUrls.forEach(url => outWeights.set(url, 0));
42
- for (const edge of allEdges) {
43
- if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
44
- const weight = edge.weight || 1.0;
45
- const sources = incoming.get(edge.target) ?? [];
46
- sources.push({ source: edge.source, weight });
47
- incoming.set(edge.target, sources);
48
- outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
49
- }
50
- }
51
- // Identify sinks
52
- nodeUrls.forEach(url => {
53
- if ((outWeights.get(url) || 0) === 0) {
54
- sinks.push(url);
6
+ export class PageRankService {
7
+ /**
8
+ * Computes a Production-Grade Weighted PageRank over the given graph.
9
+ * @param {Graph} graph - The full site graph structure.
10
+ * @param {PageRankOptions} options - Configuration overrides for damping factor, limits, etc.
11
+ * @returns {Map<string, PageRankRow>} The individual metrics keyed by exact normalized url.
12
+ */
13
+ evaluate(graph, options = {}) {
14
+ const d = options.dampingFactor ?? 0.85;
15
+ const maxIterations = options.maxIterations ?? 40;
16
+ const epsilon = options.convergenceThreshold ?? 1e-5;
17
+ const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
18
+ const neutralScoreWhenFlat = options.neutralScoreWhenFlat ?? 50;
19
+ const allNodes = graph.getNodes();
20
+ const allEdges = graph.getEdges();
21
+ // 1. Filter Eligible Nodes
22
+ const eligibleNodes = allNodes.filter(node => {
23
+ if (node.noindex)
24
+ return false;
25
+ if (node.isCollapsed)
26
+ return false;
27
+ // Keep compat with other plugins mutating soft404Score onto nodes
28
+ if (node.soft404Score && node.soft404Score > soft404Threshold)
29
+ return false;
30
+ // canonical is stored as absolute URL; extract pathname for path-based comparison
31
+ if (node.canonical) {
32
+ try {
33
+ const canonicalPath = new URL(node.canonical).pathname;
34
+ if (canonicalPath !== node.url)
35
+ return false;
36
+ }
37
+ catch {
38
+ // if canonical isn't a valid URL, compare as-is
39
+ if (node.canonical !== node.url)
40
+ return false;
41
+ }
42
+ }
43
+ if (node.status >= 400)
44
+ return false; // Don't pass rank to broken pages
45
+ if (node.status === 0)
46
+ return false; // Don't pass rank to uncrawled/external pages
47
+ return true;
48
+ });
49
+ const nodeCount = eligibleNodes.length;
50
+ const results = new Map();
51
+ if (nodeCount === 0)
52
+ return results;
53
+ // Map URL to Index for O(1) access and TypedArray usage
54
+ const urlToIndex = new Map();
55
+ for (let i = 0; i < nodeCount; i++) {
56
+ urlToIndex.set(eligibleNodes[i].url, i);
55
57
  }
56
- });
57
- // Iterative Calculation
58
- for (let i = 0; i < maxIterations; i++) {
59
- const nextPr = new Map();
60
- // Calculate total rank from sinks to redistribute
61
- let sinkRankTotal = 0;
62
- for (const url of sinks) {
63
- sinkRankTotal += pr.get(url) || 0;
58
+ // Pre-calculate weighted outbound sums and inverted adjacency
59
+ const outWeights = new Float64Array(nodeCount);
60
+ const incoming = new Array(nodeCount).fill(null).map(() => []);
61
+ for (const edge of allEdges) {
62
+ const sourceIndex = urlToIndex.get(edge.source);
63
+ const targetIndex = urlToIndex.get(edge.target);
64
+ if (sourceIndex !== undefined && targetIndex !== undefined) {
65
+ const weight = edge.weight || 1.0;
66
+ incoming[targetIndex].push({ sourceIndex, weight });
67
+ outWeights[sourceIndex] += weight;
68
+ }
64
69
  }
65
- const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
66
- for (const url of nodeUrls) {
67
- let rankFromLinks = 0;
68
- const sources = incoming.get(url) || [];
69
- for (const edge of sources) {
70
- const sourceRank = pr.get(edge.source) || 0;
71
- const sourceOutWeight = outWeights.get(edge.source) || 1.0;
72
- rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
70
+ // Identify sinks
71
+ const sinks = [];
72
+ for (let i = 0; i < nodeCount; i++) {
73
+ if (outWeights[i] === 0) {
74
+ sinks.push(i);
73
75
  }
74
- const newRank = baseRank + d * rankFromLinks;
75
- nextPr.set(url, newRank);
76
76
  }
77
- // Convergence check
78
- let maxDelta = 0;
79
- for (const url of nodeUrls) {
80
- const delta = Math.abs(nextPr.get(url) - pr.get(url));
81
- if (delta > maxDelta)
82
- maxDelta = delta;
77
+ // Initialize PageRank typed arrays
78
+ let pr = new Float64Array(nodeCount).fill(1 / nodeCount);
79
+ let nextPr = new Float64Array(nodeCount);
80
+ // Iterative Calculation
81
+ for (let iter = 0; iter < maxIterations; iter++) {
82
+ // Calculate total rank from sinks to redistribute
83
+ let sinkRankTotal = 0;
84
+ for (let i = 0; i < sinks.length; i++) {
85
+ sinkRankTotal += pr[sinks[i]];
86
+ }
87
+ const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
88
+ let maxDelta = 0;
89
+ for (let i = 0; i < nodeCount; i++) {
90
+ let rankFromLinks = 0;
91
+ const sources = incoming[i];
92
+ for (let j = 0; j < sources.length; j++) {
93
+ const edge = sources[j];
94
+ const sourceRank = pr[edge.sourceIndex];
95
+ const sourceOutWeight = outWeights[edge.sourceIndex] || 1.0;
96
+ rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
97
+ }
98
+ const newRank = baseRank + d * rankFromLinks;
99
+ nextPr[i] = newRank;
100
+ const delta = Math.abs(newRank - pr[i]);
101
+ if (delta > maxDelta) {
102
+ maxDelta = delta;
103
+ }
104
+ }
105
+ // Swap arrays
106
+ const temp = pr;
107
+ pr = nextPr;
108
+ nextPr = temp;
109
+ if (maxDelta < epsilon)
110
+ break;
83
111
  }
84
- pr = nextPr;
85
- if (maxDelta < epsilon)
86
- break;
87
- }
88
- // 2. Normalization (0-100)
89
- const ranks = Array.from(pr.values());
90
- const minPR = Math.min(...ranks);
91
- const maxPR = Math.max(...ranks);
92
- const range = maxPR - minPR;
93
- for (const node of eligibleNodes) {
94
- const rawRank = pr.get(node.url);
95
- node.pageRank = rawRank;
96
- if (range > 1e-12) {
97
- node.pageRankScore = 100 * (rawRank - minPR) / range;
112
+ // 2. Normalization (0-100)
113
+ let minPR = pr[0];
114
+ let maxPR = pr[0];
115
+ for (let i = 1; i < nodeCount; i++) {
116
+ const rank = pr[i];
117
+ if (rank < minPR)
118
+ minPR = rank;
119
+ if (rank > maxPR)
120
+ maxPR = rank;
98
121
  }
99
- else {
100
- // If there's no range, all eligible pages are equally important.
101
- node.pageRankScore = 100;
122
+ const range = maxPR - minPR;
123
+ for (let i = 0; i < nodeCount; i++) {
124
+ const rawRank = pr[i];
125
+ const url = eligibleNodes[i].url;
126
+ let score = neutralScoreWhenFlat;
127
+ if (range > DEFAULTS.GRAPH_PRECISION) {
128
+ score = 100 * (rawRank - minPR) / range;
129
+ }
130
+ results.set(url, {
131
+ raw_rank: rawRank,
132
+ score: Number(score.toFixed(3))
133
+ });
102
134
  }
135
+ return results;
103
136
  }
104
137
  }
package/dist/index.d.ts CHANGED
@@ -1,26 +1,35 @@
1
+ export * from './scoring/health.js';
1
2
  export * from './crawler/crawl.js';
2
3
  export * from './crawler/normalize.js';
3
4
  export * from './crawler/metricsRunner.js';
5
+ export * from './crawler/trap.js';
4
6
  export * from './graph/metrics.js';
5
- export * from './report/html.js';
6
- export * from './report/crawl_template.js';
7
- export * from './report/crawlExport.js';
8
7
  export * from './graph/graph.js';
9
- export * from './diff/compare.js';
10
- export * from './scoring/orphanSeverity.js';
8
+ export * from './graph/simhash.js';
11
9
  export * from './graph/pagerank.js';
12
- export * from './graph/duplicate.js';
13
- export * from './graph/cluster.js';
14
- export * from './scoring/health.js';
15
- export * from './scoring/hits.js';
10
+ export * from './graph/hits.js';
11
+ export * from './diff/compare.js';
12
+ export * from './diff/service.js';
16
13
  export * from './analysis/analyze.js';
17
14
  export * from './analysis/content.js';
18
15
  export * from './analysis/seo.js';
19
16
  export * from './analysis/images.js';
20
17
  export * from './analysis/links.js';
18
+ export * from './analysis/scoring.js';
19
+ export * from './analysis/clustering.js';
20
+ export * from './analysis/duplicate.js';
21
+ export * from './analysis/soft404.js';
22
+ export * from './analysis/heading.js';
23
+ export * from './analysis/orphan.js';
21
24
  export * from './audit/index.js';
22
25
  export * from './audit/types.js';
26
+ export * from './report/html.js';
27
+ export * from './report/crawl_template.js';
28
+ export * from './report/crawlExport.js';
29
+ export * from './report/export.js';
30
+ export * from './report/insight.js';
23
31
  export * from './db/index.js';
32
+ export * from './db/reset.js';
24
33
  export * from './db/graphLoader.js';
25
34
  export * from './db/repositories/SiteRepository.js';
26
35
  export * from './db/repositories/SnapshotRepository.js';
@@ -30,4 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
30
39
  export * from './lock/lockManager.js';
31
40
  export * from './lock/hashKey.js';
32
41
  export * from './utils/version.js';
42
+ export * from './utils/secureConfig.js';
33
43
  export * from './events.js';
44
+ export * from './plugin-system/plugin-types.js';
45
+ export * from './plugin-system/plugin-loader.js';
46
+ export * from './plugin-system/plugin-registry.js';
47
+ export * from './plugin-system/plugin-cli.js';
48
+ export * from './ports/index.js';
49
+ export * from './application/usecase.js';
50
+ export * from './application/usecases.js';
51
+ export { Command } from 'commander';
package/dist/index.js CHANGED
@@ -1,26 +1,35 @@
1
+ export * from './scoring/health.js';
1
2
  export * from './crawler/crawl.js';
2
3
  export * from './crawler/normalize.js';
3
4
  export * from './crawler/metricsRunner.js';
5
+ export * from './crawler/trap.js';
4
6
  export * from './graph/metrics.js';
5
- export * from './report/html.js';
6
- export * from './report/crawl_template.js';
7
- export * from './report/crawlExport.js';
8
7
  export * from './graph/graph.js';
9
- export * from './diff/compare.js';
10
- export * from './scoring/orphanSeverity.js';
8
+ export * from './graph/simhash.js';
11
9
  export * from './graph/pagerank.js';
12
- export * from './graph/duplicate.js';
13
- export * from './graph/cluster.js';
14
- export * from './scoring/health.js';
15
- export * from './scoring/hits.js';
10
+ export * from './graph/hits.js';
11
+ export * from './diff/compare.js';
12
+ export * from './diff/service.js';
16
13
  export * from './analysis/analyze.js';
17
14
  export * from './analysis/content.js';
18
15
  export * from './analysis/seo.js';
19
16
  export * from './analysis/images.js';
20
17
  export * from './analysis/links.js';
18
+ export * from './analysis/scoring.js';
19
+ export * from './analysis/clustering.js';
20
+ export * from './analysis/duplicate.js';
21
+ export * from './analysis/soft404.js';
22
+ export * from './analysis/heading.js';
23
+ export * from './analysis/orphan.js';
21
24
  export * from './audit/index.js';
22
25
  export * from './audit/types.js';
26
+ export * from './report/html.js';
27
+ export * from './report/crawl_template.js';
28
+ export * from './report/crawlExport.js';
29
+ export * from './report/export.js';
30
+ export * from './report/insight.js';
23
31
  export * from './db/index.js';
32
+ export * from './db/reset.js';
24
33
  export * from './db/graphLoader.js';
25
34
  export * from './db/repositories/SiteRepository.js';
26
35
  export * from './db/repositories/SnapshotRepository.js';
@@ -30,4 +39,13 @@ export * from './db/repositories/MetricsRepository.js';
30
39
  export * from './lock/lockManager.js';
31
40
  export * from './lock/hashKey.js';
32
41
  export * from './utils/version.js';
42
+ export * from './utils/secureConfig.js';
33
43
  export * from './events.js';
44
+ export * from './plugin-system/plugin-types.js';
45
+ export * from './plugin-system/plugin-loader.js';
46
+ export * from './plugin-system/plugin-registry.js';
47
+ export * from './plugin-system/plugin-cli.js';
48
+ export * from './ports/index.js';
49
+ export * from './application/usecase.js';
50
+ export * from './application/usecases.js';
51
+ export { Command } from 'commander';
@@ -5,6 +5,7 @@ export declare class LockManager {
5
5
  private static get lockDir();
6
6
  static acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force?: boolean): Promise<void>;
7
7
  static releaseLock(): void;
8
+ static clearAllLocks(): Promise<number>;
8
9
  private static log;
9
10
  private static registerHandlers;
10
11
  }
@@ -85,6 +85,21 @@ export class LockManager {
85
85
  }
86
86
  }
87
87
  }
88
+ static async clearAllLocks() {
89
+ if (!existsSync(this.lockDir))
90
+ return 0;
91
+ const files = await fs.readdir(this.lockDir);
92
+ const lockFiles = files.filter(f => f.endsWith('.lock'));
93
+ let count = 0;
94
+ for (const file of lockFiles) {
95
+ try {
96
+ await fs.unlink(path.join(this.lockDir, file));
97
+ count++;
98
+ }
99
+ catch { /* ignore */ }
100
+ }
101
+ return count;
102
+ }
88
103
  static log(type, message, error) {
89
104
  if (this.context) {
90
105
  this.context.emit({ type, message, error });
@@ -0,0 +1,10 @@
1
+ import { Command } from 'commander';
2
+ /**
3
+ * Standard utility for plugins to register their configuration commands.
4
+ * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
5
+ *
6
+ * @param cli - The main Commander instance (must have name 'crawlith').
7
+ * @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
8
+ * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
9
+ */
10
+ export declare function registerPluginConfigCommand(cli: Command, pluginName: string, credentialLabel: string): void;
@@ -0,0 +1,31 @@
1
+ import { Command } from 'commander';
2
+ import { setEncryptedConfigKey } from '../utils/secureConfig.js';
3
+ /**
4
+ * Standard utility for plugins to register their configuration commands.
5
+ * This ensures a consistent 'config <plugin> set' CLI pattern across the ecosystem.
6
+ *
7
+ * @param cli - The main Commander instance (must have name 'crawlith').
8
+ * @param pluginName - The unique name of the plugin (e.g., 'pagespeed').
9
+ * @param credentialLabel - Human-readable label for the credential (e.g., 'Google API Key').
10
+ */
11
+ export function registerPluginConfigCommand(cli, pluginName, credentialLabel) {
12
+ // Only register subcommands if we are in the root 'crawlith' CLI context
13
+ if (cli.name() !== 'crawlith')
14
+ return;
15
+ // Find or create 'config' command
16
+ let configCmd = cli.commands.find(c => c.name() === 'config');
17
+ if (!configCmd) {
18
+ configCmd = new Command('config').description('Manage Crawlith plugin configuration');
19
+ cli.addCommand(configCmd);
20
+ }
21
+ // Define plugin-specific subcommand
22
+ const pluginConfigCmd = new Command(pluginName).description(`Manage ${pluginName} configuration`);
23
+ pluginConfigCmd
24
+ .command('set <value>')
25
+ .description(`Set and encrypt ${credentialLabel}`)
26
+ .action((value) => {
27
+ setEncryptedConfigKey(pluginName, value);
28
+ console.log(`✅ ${credentialLabel} for ${pluginName} saved and encrypted.`);
29
+ });
30
+ configCmd.addCommand(pluginConfigCmd);
31
+ }