@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,110 +0,0 @@
1
- import { Graph, GraphNode } from './graph.js';
2
-
3
/**
 * Summary statistics describing the link structure of a crawled graph.
 */
export interface Metrics {
  /** Number of nodes in the graph. */
  totalPages: number;
  /** Number of edges in the graph. */
  totalEdges: number;
  /** URLs with no inbound links that are not at depth 0. */
  orphanPages: string[];
  /** URLs with exactly one inbound link at depth 3 or deeper. */
  nearOrphans: string[];
  /** URLs at depth 4 or deeper. */
  deepPages: string[];
  /** Up to ten pages with the highest authority, best first. */
  topAuthorityPages: { url: string; authority: number }[];
  /** totalEdges / totalPages (0 for an empty graph). */
  averageOutDegree: number;
  /** Largest depth seen on any node. */
  maxDepthFound: number;
  /** 1 - (deep pages / total pages); 1 for an empty graph. */
  crawlEfficiencyScore: number;
  /** Mean node depth (0 for an empty graph). */
  averageDepth: number;
  /** Shannon entropy (base 2) of the out-degree distribution. */
  structuralEntropy: number;
  /** Up to ten pages with the highest raw PageRank, best first. */
  topPageRankPages: { url: string; score: number }[];
  /** Copied from the graph's `limitReached` flag. */
  limitReached: boolean;
  /** Copied from the graph's `sessionStats`, when present. */
  sessionStats?: {
    pagesFetched: number;
    pagesCached: number;
    pagesSkipped: number;
    totalFound: number;
  };
}

/**
 * Computes structural metrics for a graph in a single pass over its nodes.
 *
 * @param graph - Graph whose nodes and edges are summarised.
 * @param _maxDepth - Accepted for interface compatibility; not used.
 * @returns The aggregated {@link Metrics}.
 */
export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
  const pages = graph.getNodes();
  const links = graph.getEdges();

  const totalPages = pages.length;
  const totalEdges = links.length;
  const hasPages = totalPages > 0;

  const orphanPages: string[] = [];
  const nearOrphans: string[] = [];
  const deepPages: string[] = [];
  const outDegreeHistogram = new Map<number, number>();

  let peakInLinks = 0;
  let depthTotal = 0;
  let maxDepthFound = 0;

  for (const page of pages) {
    peakInLinks = Math.max(peakInLinks, page.inLinks);
    maxDepthFound = Math.max(maxDepthFound, page.depth);
    depthTotal += page.depth;

    // Orphan: nothing links to it and it is not an entry (depth 0) page.
    if (page.inLinks === 0 && page.depth > 0) {
      orphanPages.push(page.url);
    }
    // Near-orphan: a single inbound link, three or more levels down.
    if (page.inLinks === 1 && page.depth >= 3) {
      nearOrphans.push(page.url);
    }
    // Deep page: four or more levels down.
    if (page.depth >= 4) {
      deepPages.push(page.url);
    }

    const seen = outDegreeHistogram.get(page.outLinks) || 0;
    outDegreeHistogram.set(page.outLinks, seen + 1);
  }

  // Fallback authority: log-scaled inbound links relative to the best-linked page.
  const fallbackAuthority = (page: GraphNode): number =>
    peakInLinks === 0 ? 0 : Math.log(1 + page.inLinks) / Math.log(1 + peakInLinks);

  let structuralEntropy = 0;
  if (hasPages) {
    for (const bucketSize of outDegreeHistogram.values()) {
      const share = bucketSize / totalPages;
      if (share > 0) {
        structuralEntropy -= share * Math.log2(share);
      }
    }
  }

  const topAuthorityPages = pages
    .map(page => ({
      url: page.url,
      authority: page.authorityScore ?? fallbackAuthority(page)
    }))
    .sort((left, right) => right.authority - left.authority)
    .slice(0, 10);

  const topPageRankPages = pages
    .flatMap(page => (page.pageRank === undefined ? [] : [{ url: page.url, score: page.pageRank }]))
    .sort((left, right) => right.score - left.score)
    .slice(0, 10);

  return {
    totalPages,
    totalEdges,
    orphanPages,
    nearOrphans,
    deepPages,
    topAuthorityPages,
    averageOutDegree: hasPages ? totalEdges / totalPages : 0,
    maxDepthFound,
    crawlEfficiencyScore: hasPages ? 1 - (deepPages.length / totalPages) : 1,
    averageDepth: hasPages ? depthTotal / totalPages : 0,
    structuralEntropy,
    topPageRankPages,
    limitReached: graph.limitReached,
    sessionStats: graph.sessionStats
  };
}
@@ -1,125 +0,0 @@
1
- import { Graph, GraphNode } from './graph.js';
2
-
3
/** Tuning knobs for {@link computePageRank}; every field is optional. */
interface PageRankOptions {
  /** Probability of following a link rather than jumping; defaults to 0.85. */
  dampingFactor?: number;
  /** Upper bound on power-iteration rounds; defaults to 40. */
  maxIterations?: number;
  /** Iteration stops once the largest per-node change drops below this; defaults to 1e-5. */
  convergenceThreshold?: number;
  /** Nodes whose soft404Score exceeds this are excluded; defaults to 0.8. */
  soft404WeightThreshold?: number;
}

/**
 * Weighted PageRank over the eligible nodes of a graph.
 *
 * Nodes are excluded when they are noindex, collapsed, likely soft-404s,
 * canonicalised to a different URL, or have an HTTP status >= 400. Only edges
 * whose source and target are both eligible take part. Rank held by sink
 * nodes (no eligible outbound weight) is redistributed evenly each round.
 *
 * Results are written onto the eligible nodes: `pageRank` receives the raw
 * rank and `pageRankScore` a 0-100 min-max normalised value (100 for every
 * node when all ranks are equal). Ineligible nodes are left untouched.
 *
 * @param graph - Graph whose nodes are ranked in place.
 * @param options - Optional tuning parameters.
 */
export function computePageRank(graph: Graph, options: PageRankOptions = {}): void {
  const d = options.dampingFactor ?? 0.85;
  const maxIterations = options.maxIterations ?? 40;
  const epsilon = options.convergenceThreshold ?? 1e-5;
  const soft404Threshold = options.soft404WeightThreshold ?? 0.8;

  const allNodes = graph.getNodes();
  const allEdges = graph.getEdges();

  // 1. Filter eligible nodes
  const eligibleNodes = allNodes.filter(node => {
    if (node.noindex) return false;
    if (node.isCollapsed) return false;
    if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
    if (node.canonical && node.canonical !== node.url) return false;
    if (node.status >= 400) return false; // Broken pages neither receive nor pass rank
    return true;
  });

  const nodeCount = eligibleNodes.length;
  if (nodeCount === 0) return;

  const nodeUrls = eligibleNodes.map(n => n.url);
  const nodeMap = new Map<string, GraphNode>();
  for (const node of eligibleNodes) nodeMap.set(node.url, node);

  // Every eligible node starts with an equal share of rank.
  let pr = new Map<string, number>();
  for (const url of nodeUrls) pr.set(url, 1 / nodeCount);

  // Pre-compute each node's total outbound weight and the inverted adjacency list.
  const outWeights = new Map<string, number>();
  const incoming = new Map<string, { source: string; weight: number }[]>();
  for (const url of nodeUrls) outWeights.set(url, 0);

  for (const edge of allEdges) {
    if (!nodeMap.has(edge.source) || !nodeMap.has(edge.target)) continue;

    // A missing (or zero) weight is treated as a unit-weight link.
    const weight = edge.weight || 1.0;

    const sources = incoming.get(edge.target) ?? [];
    sources.push({ source: edge.source, weight });
    incoming.set(edge.target, sources);

    outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
  }

  // Sinks have no eligible outbound weight; their rank is spread over all nodes.
  const sinks = nodeUrls.filter(url => (outWeights.get(url) || 0) === 0);

  // 2. Power iteration
  for (let i = 0; i < maxIterations; i++) {
    const nextPr = new Map<string, number>();

    let sinkRankTotal = 0;
    for (const url of sinks) {
      sinkRankTotal += pr.get(url) || 0;
    }

    // Teleport share plus the evenly redistributed sink rank.
    const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);

    let maxDelta = 0;
    for (const url of nodeUrls) {
      let rankFromLinks = 0;
      for (const edge of incoming.get(url) || []) {
        const sourceRank = pr.get(edge.source) || 0;
        const sourceOutWeight = outWeights.get(edge.source) || 1.0;
        rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
      }

      const newRank = baseRank + d * rankFromLinks;
      nextPr.set(url, newRank);

      const delta = Math.abs(newRank - (pr.get(url) ?? 0));
      if (delta > maxDelta) maxDelta = delta;
    }

    pr = nextPr;

    if (maxDelta < epsilon) break;
  }

  // 3. Normalisation (0-100).
  // Min/max are found with a loop instead of Math.min(...ranks): spreading one
  // argument per page can exceed the engine's argument limit on large crawls
  // and throw a RangeError.
  let minPR = Infinity;
  let maxPR = -Infinity;
  for (const rank of pr.values()) {
    if (rank < minPR) minPR = rank;
    if (rank > maxPR) maxPR = rank;
  }
  const range = maxPR - minPR;

  for (const node of eligibleNodes) {
    const rawRank = pr.get(node.url) ?? 0;
    node.pageRank = rawRank;

    // With no spread in rank, all eligible pages are equally important.
    node.pageRankScore = range > 1e-12 ? 100 * (rawRank - minPR) / range : 100;
  }
}
@@ -1,61 +0,0 @@
1
/**
 * 64-bit SimHash fingerprinting built on an FNV-1a token hash.
 *
 * All hashes are represented as non-negative `bigint` values below 2^64.
 */
export class SimHash {
  private static readonly FNV_PRIME = 1099511628211n;
  private static readonly FNV_OFFSET_BASIS = 14695981039346656037n;
  private static readonly MAX_UINT64 = 0xffffffffffffffffn;

  /**
   * Generates a 64-bit FNV-1a hash for a given string token.
   *
   * The hash is fed one UTF-16 code unit at a time (`charCodeAt`), not one
   * UTF-8 byte at a time.
   *
   * @param token - Text to hash.
   * @returns The hash as a value in [0, 2^64).
   */
  static fnv1a64(token: string): bigint {
    let hash = SimHash.FNV_OFFSET_BASIS;
    for (let i = 0; i < token.length; i++) {
      hash ^= BigInt(token.charCodeAt(i));
      // Keep the product inside 64 bits (multiplication modulo 2^64).
      hash = (hash * SimHash.FNV_PRIME) & SimHash.MAX_UINT64;
    }
    return hash;
  }

  /**
   * Generates a 64-bit SimHash from an array of tokens.
   *
   * Each token votes +1 or -1 on every bit position according to its FNV-1a
   * hash; positions with a strictly positive tally are set in the result.
   *
   * @param tokens - Tokens to fingerprint; an empty array yields `0n`.
   * @returns The fingerprint as a value in [0, 2^64).
   */
  static generate(tokens: string[]): bigint {
    const votes = new Int32Array(64);

    for (const token of tokens) {
      let remaining = SimHash.fnv1a64(token);
      for (let bit = 0; bit < 64; bit++) {
        if ((remaining & 1n) === 1n) {
          votes[bit]++;
        } else {
          votes[bit]--;
        }
        remaining >>= 1n;
      }
    }

    let simhash = 0n;
    for (let bit = 0; bit < 64; bit++) {
      if (votes[bit] > 0) {
        simhash |= 1n << BigInt(bit);
      }
    }

    return simhash;
  }

  /**
   * Computes the Hamming distance between two 64-bit hashes.
   *
   * Inputs are compared on their low 64 bits, so a hash supplied as a signed
   * 64-bit value (negative bigint) compares equal to its unsigned form.
   *
   * @param a - First hash.
   * @param b - Second hash.
   * @returns Number of differing bits, from 0 to 64.
   */
  static hammingDistance(a: bigint, b: bigint): number {
    // Masking makes the XOR non-negative; without it a negative XOR would skip
    // the loop entirely and report a distance of 0.
    let xor = (a ^ b) & SimHash.MAX_UINT64;
    let distance = 0;
    while (xor > 0n) {
      // Kernighan's bit counting: clear the lowest set bit each round.
      xor &= xor - 1n;
      distance++;
    }
    return distance;
  }
}
package/src/index.ts DELETED
@@ -1,30 +0,0 @@
1
- export * from './crawler/crawl.js';
2
- export * from './crawler/metricsRunner.js';
3
- export * from './graph/metrics.js';
4
- export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
7
- export * from './graph/graph.js';
8
- export * from './diff/compare.js';
9
- export * from './scoring/orphanSeverity.js';
10
- export * from './graph/pagerank.js';
11
- export * from './graph/duplicate.js';
12
- export * from './graph/cluster.js';
13
- export * from './scoring/hits.js';
14
- export * from './analysis/analyze.js';
15
- export * from './analysis/content.js';
16
- export * from './analysis/seo.js';
17
- export * from './analysis/images.js';
18
- export * from './analysis/links.js';
19
- export * from './audit/index.js';
20
- export * from './audit/types.js';
21
- export * from './db/index.js';
22
- export * from './db/graphLoader.js';
23
- export * from './db/repositories/SiteRepository.js';
24
- export * from './db/repositories/SnapshotRepository.js';
25
- export * from './db/repositories/PageRepository.js';
26
- export * from './db/repositories/EdgeRepository.js';
27
- export * from './db/repositories/MetricsRepository.js';
28
- export * from './lock/lockManager.js';
29
- export * from './lock/hashKey.js';
30
- export * from './utils/version.js';
@@ -1,51 +0,0 @@
1
- import crypto from 'node:crypto';
2
- import { normalizeUrl } from '../crawler/normalize.js';
3
-
4
// Options that change what a crawl does; only these take part in the lock key.
const RELEVANT_FLAGS = [
  'limit',
  'depth',
  'output',
  'sitemap',
  'incremental',
  'detectSoft404',
  'detectTraps',
  'includeSubdomains',
  'allow',
  'deny',
  'proxy',
  'ua',
  'maxRedirects',
  'rate',
  'maxBytes',
  'concurrency'
];

/**
 * Builds the lock key for a command invocation.
 *
 * The key is the SHA-256 hex digest of a JSON document holding the command
 * name, the normalised target URL (falling back to the raw URL when
 * normalisation yields nothing) and the defined options listed in
 * RELEVANT_FLAGS.
 *
 * @param commandName - Name of the command being run.
 * @param targetUrl - URL the command targets.
 * @param options - Parsed command options; `query` controls query stripping.
 * @returns A 64-character hexadecimal digest.
 */
export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
  // Queries are stripped unless the caller explicitly asked to keep them.
  const target = normalizeUrl(targetUrl, '', { stripQuery: !options.query }) || targetUrl;

  // Walking RELEVANT_FLAGS in its fixed order keeps property insertion order,
  // and therefore the serialised JSON, deterministic.
  const picked = RELEVANT_FLAGS.reduce<Record<string, any>>((acc, flag) => {
    const value = options[flag];
    if (value !== undefined) {
      acc[flag] = value;
    }
    return acc;
  }, {});

  const payload = JSON.stringify({
    command: commandName,
    target,
    options: picked
  });

  return crypto.createHash('sha256').update(payload).digest('hex');
}
@@ -1,124 +0,0 @@
1
- import fs from 'node:fs/promises';
2
- import { existsSync, unlinkSync, readFileSync } from 'node:fs';
3
- import path from 'node:path';
4
- import os from 'node:os';
5
- import chalk from 'chalk';
6
- import { generateLockKey } from './hashKey.js';
7
- import { isPidAlive } from './pidCheck.js';
8
-
9
/** JSON payload written into a lock file. */
interface LockData {
  /** PID of the process holding the lock. */
  pid: number;
  /** Acquisition time as returned by Date.now(). */
  startedAt: number;
  command: string;
  target: string;
  /** Options the command was started with, stored as given. */
  args: any;
}

/**
 * File-based lock that prevents the same command from running twice against
 * the same target with the same relevant options.
 *
 * Lock files live under `~/.crawlith/locks` and are named after the hash
 * returned by `generateLockKey`. One lock path is tracked per process.
 */
export class LockManager {
  private static lockFilePath: string | null = null;
  /** Set once process-level cleanup handlers are installed. */
  private static handlersRegistered = false;

  private static get lockDir(): string {
    return path.join(os.homedir(), '.crawlith', 'locks');
  }

  /**
   * Acquires the lock for a command invocation.
   *
   * An existing lock held by a live process terminates this process with exit
   * code 1 unless `force` is set. Stale or unreadable locks are removed.
   *
   * @param commandName - Name of the command being run.
   * @param targetUrl - URL the command targets.
   * @param options - Parsed command options; stored in the lock file.
   * @param force - When true, an existing lock is overridden.
   * @throws Any file-system error other than `EEXIST` raised while writing the lock.
   */
  static async acquireLock(commandName: string, targetUrl: string, options: any, force: boolean = false): Promise<void> {
    const lockHash = generateLockKey(commandName, targetUrl, options);

    // Ensure the lock directory exists before looking for a lock file.
    await fs.mkdir(this.lockDir, { recursive: true });

    const lockPath = path.join(this.lockDir, `${lockHash}.lock`);

    if (existsSync(lockPath)) {
      // Until a valid, live PID is found the lock is considered stale.
      let isStale = true;
      let pid = 0;

      try {
        const lockData = JSON.parse(readFileSync(lockPath, 'utf-8')) as Partial<LockData> | null;
        const recordedPid = lockData?.pid;
        // Only a positive integer identifies a single process; 0 and negative
        // values address process groups and must not be probed.
        if (typeof recordedPid === 'number' && Number.isInteger(recordedPid) && recordedPid > 0) {
          pid = recordedPid;
          isStale = !isPidAlive(pid);
        }
      } catch {
        // Unreadable or corrupted lock file -> keep treating it as stale.
      }

      if (force) {
        console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
        try { unlinkSync(lockPath); } catch { /* ignore */ }
      } else if (!isStale) {
        console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
        process.exit(1);
      } else {
        console.log(chalk.gray('Detected stale lock. Continuing execution.'));
        try { unlinkSync(lockPath); } catch { /* ignore */ }
      }
    }

    try {
      const data: LockData = {
        pid: process.pid,
        startedAt: Date.now(),
        command: commandName,
        target: targetUrl,
        args: options
      };

      // 'wx' makes creation atomic: the write fails if the file already exists.
      await fs.writeFile(lockPath, JSON.stringify(data, null, 2), { flag: 'wx', encoding: 'utf-8' });

      this.lockFilePath = lockPath;
      this.registerHandlers();
    } catch (error: unknown) {
      if (error instanceof Error && (error as NodeJS.ErrnoException).code === 'EEXIST') {
        // Another process created the lock between our check and our write.
        console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
        process.exit(1);
      }
      throw error;
    }
  }

  /**
   * Removes the lock file acquired by this process, if any.
   *
   * Cleanup is best-effort and never throws. The tracked path is cleared once
   * the file is gone (removed here or already missing); on any other failure
   * it is kept so a later call can retry.
   */
  static releaseLock(): void {
    const lockPath = this.lockFilePath;
    if (!lockPath) return;

    try {
      unlinkSync(lockPath);
    } catch (error: unknown) {
      const code = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined;
      if (code !== 'ENOENT') {
        // Could not remove the file; leave the path in place for a retry.
        return;
      }
    }
    this.lockFilePath = null;
  }

  /**
   * Installs process-level handlers that release the lock on exit, SIGINT,
   * SIGTERM and uncaught exceptions. Handlers are installed at most once per
   * process, however many times a lock is acquired.
   */
  private static registerHandlers(): void {
    if (this.handlersRegistered) return;
    this.handlersRegistered = true;

    const cleanup = () => {
      this.releaseLock();
    };

    // 'exit' listeners must do their work synchronously.
    process.on('exit', cleanup);

    process.on('SIGINT', () => {
      cleanup();
      process.exit(130);
    });
    process.on('SIGTERM', () => {
      cleanup();
      process.exit(143);
    });
    process.on('uncaughtException', (err) => {
      console.error(chalk.red('Uncaught Exception:'), err);
      cleanup();
      process.exit(1);
    });
  }
}
@@ -1,13 +0,0 @@
1
/**
 * Reports whether a process with the given PID currently exists.
 *
 * Existence is probed with signal 0, which performs the permission and
 * existence checks without delivering a signal.
 *
 * @param pid - Process ID to probe.
 * @returns `true` if the process exists (including when it exists but may not
 *   be signalled by this user); `false` if it does not exist, the probe fails
 *   for any other reason, or `pid` is not a positive integer.
 */
export function isPidAlive(pid: number): boolean {
  // 0 and negative values address process groups rather than one process, so
  // probing them would report "alive" for something that is not a PID.
  if (!Number.isInteger(pid) || pid <= 0) {
    return false;
  }

  try {
    process.kill(pid, 0);
    return true;
  } catch (error: unknown) {
    // EPERM: the process exists but we may not signal it -> alive.
    // Anything else (ESRCH, invalid argument, ...) -> not alive.
    return error instanceof Error && (error as NodeJS.ErrnoException).code === 'EPERM';
  }
}