@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,74 @@
1
/**
 * Summarises the link structure of a crawled site graph.
 *
 * Reads the node list and edge list from `graph` and derives page counts,
 * orphan / near-orphan / deep page lists, depth statistics, the Shannon
 * entropy of the out-link distribution, and the top-10 pages by authority
 * and by raw PageRank.
 *
 * @param graph - Object exposing `getNodes()`, `getEdges()`, `limitReached`
 *   and `sessionStats`. Nodes are read for `url`, `inLinks`, `outLinks`,
 *   `depth`, and the optional `authorityScore` / `pageRank` fields.
 * @param _maxDepth - Accepted for signature compatibility; never read.
 * @returns A plain object holding the computed metrics.
 */
export function calculateMetrics(graph, _maxDepth) {
    const pages = graph.getNodes();
    const links = graph.getEdges();
    const totalPages = pages.length;
    const totalEdges = links.length;
    const hasPages = totalPages > 0;

    // Everything that only needs one look at each node is gathered in a single pass.
    const orphanPages = [];
    const nearOrphans = [];
    const deepPages = [];
    const outDegreeHistogram = new Map();
    let peakInLinks = 0;
    let maxDepthFound = 0;
    let depthTotal = 0;
    for (const page of pages) {
        peakInLinks = Math.max(peakInLinks, page.inLinks);
        maxDepthFound = Math.max(maxDepthFound, page.depth);
        depthTotal = depthTotal + page.depth;
        // Orphan: nothing links to it, and it is not the crawl root (depth 0).
        if (page.inLinks === 0 && page.depth > 0) {
            orphanPages.push(page.url);
        }
        // Near-orphan: exactly one inbound link and at least three levels down.
        if (page.inLinks === 1 && page.depth >= 3) {
            nearOrphans.push(page.url);
        }
        // Deep page: four or more levels down.
        if (page.depth >= 4) {
            deepPages.push(page.url);
        }
        const seen = outDegreeHistogram.get(page.outLinks) || 0;
        outDegreeHistogram.set(page.outLinks, seen + 1);
    }

    // Fallback authority: log-scaled in-link count relative to the busiest page.
    const authorityOf = (page) => {
        if (peakInLinks === 0) {
            return 0;
        }
        return Math.log(1 + page.inLinks) / Math.log(1 + peakInLinks);
    };

    // Share of pages that are NOT deep; defined as 1 for an empty graph.
    const crawlEfficiencyScore = hasPages ? 1 - (deepPages.length / totalPages) : 1;
    const averageDepth = hasPages ? depthTotal / totalPages : 0;
    const averageOutDegree = hasPages ? totalEdges / totalPages : 0;

    // Shannon entropy (base 2) over the distribution of out-link counts.
    let structuralEntropy = 0;
    if (hasPages) {
        outDegreeHistogram.forEach((count) => {
            const share = count / totalPages;
            if (share > 0) {
                structuralEntropy -= share * Math.log2(share);
            }
        });
    }

    // Top 10 by authority; a precomputed authorityScore wins over the fallback.
    const authorityRanking = pages.map((page) => {
        return { url: page.url, authority: page.authorityScore ?? authorityOf(page) };
    });
    authorityRanking.sort((left, right) => right.authority - left.authority);
    const topAuthorityPages = authorityRanking.slice(0, 10);

    // Top 10 by raw PageRank; pages that never received a rank are left out.
    const pageRankRanking = [];
    for (const page of pages) {
        if (page.pageRank !== undefined) {
            pageRankRanking.push({ url: page.url, score: page.pageRank });
        }
    }
    pageRankRanking.sort((left, right) => right.score - left.score);
    const topPageRankPages = pageRankRanking.slice(0, 10);

    return {
        totalPages,
        totalEdges,
        orphanPages,
        nearOrphans,
        deepPages,
        topAuthorityPages,
        averageOutDegree,
        maxDepthFound,
        crawlEfficiencyScore,
        averageDepth,
        structuralEntropy,
        topPageRankPages,
        limitReached: graph.limitReached,
        sessionStats: graph.sessionStats
    };
}
@@ -0,0 +1,12 @@
1
+ import { Graph } from './graph.js';
2
+ interface PageRankOptions {
3
+ dampingFactor?: number;
4
+ maxIterations?: number;
5
+ convergenceThreshold?: number;
6
+ soft404WeightThreshold?: number;
7
+ }
8
+ /**
9
+ * Production-Grade Weighted PageRank Engine
10
+ */
11
+ export declare function computePageRank(graph: Graph, options?: PageRankOptions): void;
12
+ export {};
@@ -0,0 +1,102 @@
1
/**
 * Production-Grade Weighted PageRank Engine
 *
 * Runs weighted PageRank over the eligible nodes of `graph` and writes the
 * result back onto those nodes: `pageRank` receives the raw rank and
 * `pageRankScore` the rank rescaled to 0-100 (100 for every node when all
 * ranks are equal). Nodes that are not eligible are left untouched.
 *
 * A node is eligible unless it is `noindex`, `isCollapsed`, has a
 * `soft404Score` above the threshold, declares a `canonical` different from
 * its own `url`, or has `status >= 400`. Only edges whose source and target
 * are both eligible take part. An edge weight that is missing or 0 counts
 * as 1. Rank held by nodes without outgoing weight is spread evenly over
 * all eligible nodes on each iteration.
 *
 * @param graph - Object exposing `getNodes()` and `getEdges()`.
 * @param options - Optional tuning knobs:
 *   `dampingFactor` (default 0.85), `maxIterations` (default 40),
 *   `convergenceThreshold` (default 1e-5), `soft404WeightThreshold` (default 0.8).
 * @returns Nothing; results are stored on the node objects.
 */
export function computePageRank(graph, options = {}) {
    const d = options.dampingFactor ?? 0.85;
    const maxIterations = options.maxIterations ?? 40;
    const epsilon = options.convergenceThreshold ?? 1e-5;
    const soft404Threshold = options.soft404WeightThreshold ?? 0.8;
    const allNodes = graph.getNodes();
    const allEdges = graph.getEdges();

    // 1. Filter eligible nodes
    const eligibleNodes = allNodes.filter(node => {
        if (node.noindex)
            return false;
        if (node.isCollapsed)
            return false;
        if (node.soft404Score && node.soft404Score > soft404Threshold)
            return false;
        if (node.canonical && node.canonical !== node.url)
            return false;
        if (node.status >= 400)
            return false; // Don't pass rank to broken pages
        return true;
    });
    const nodeCount = eligibleNodes.length;
    if (nodeCount === 0)
        return;
    const nodeUrls = eligibleNodes.map(n => n.url);
    const nodeMap = new Map();
    eligibleNodes.forEach(n => nodeMap.set(n.url, n));

    // Every eligible node starts with an equal share of the rank.
    let pr = new Map();
    nodeUrls.forEach(url => pr.set(url, 1 / nodeCount));

    // Pre-calculate weighted outbound sums and the inverted adjacency list.
    const outWeights = new Map();
    const incoming = new Map();
    const sinks = [];
    nodeUrls.forEach(url => outWeights.set(url, 0));
    for (const edge of allEdges) {
        if (nodeMap.has(edge.source) && nodeMap.has(edge.target)) {
            const weight = edge.weight || 1.0;
            const sources = incoming.get(edge.target) ?? [];
            sources.push({ source: edge.source, weight });
            incoming.set(edge.target, sources);
            outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
        }
    }

    // Sinks: eligible nodes with no outgoing weight to other eligible nodes.
    nodeUrls.forEach(url => {
        if ((outWeights.get(url) || 0) === 0) {
            sinks.push(url);
        }
    });

    // 2. Power iteration
    for (let i = 0; i < maxIterations; i++) {
        const nextPr = new Map();
        // Rank currently parked on sinks is redistributed uniformly.
        let sinkRankTotal = 0;
        for (const url of sinks) {
            sinkRankTotal += pr.get(url) || 0;
        }
        const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);
        for (const url of nodeUrls) {
            let rankFromLinks = 0;
            const sources = incoming.get(url) || [];
            for (const edge of sources) {
                const sourceRank = pr.get(edge.source) || 0;
                const sourceOutWeight = outWeights.get(edge.source) || 1.0;
                rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
            }
            nextPr.set(url, baseRank + d * rankFromLinks);
        }
        // Convergence check: largest per-node change in this iteration.
        let maxDelta = 0;
        for (const url of nodeUrls) {
            const delta = Math.abs(nextPr.get(url) - pr.get(url));
            if (delta > maxDelta)
                maxDelta = delta;
        }
        pr = nextPr;
        if (maxDelta < epsilon)
            break;
    }

    // 3. Normalization (0-100)
    // A running min/max is used instead of Math.min(...ranks): spreading one
    // argument per node overflows the call stack on large crawls.
    let minPR = Infinity;
    let maxPR = -Infinity;
    for (const rank of pr.values()) {
        minPR = Math.min(minPR, rank);
        maxPR = Math.max(maxPR, rank);
    }
    const range = maxPR - minPR;
    for (const node of eligibleNodes) {
        const rawRank = pr.get(node.url);
        node.pageRank = rawRank;
        if (range > 1e-12) {
            node.pageRankScore = 100 * (rawRank - minPR) / range;
        }
        else {
            // If there's no range, all eligible pages are equally important.
            node.pageRankScore = 100;
        }
    }
}
@@ -0,0 +1,17 @@
1
+ export declare class SimHash {
2
+ private static FNV_PRIME;
3
+ private static FNV_OFFSET_BASIS;
4
+ private static MAX_UINT64;
5
+ /**
6
+ * Generates a 64-bit FNV-1a hash for a given string token.
7
+ */
8
+ static fnv1a64(token: string): bigint;
9
+ /**
10
+ * Generates a 64-bit SimHash from an array of tokens.
11
+ */
12
+ static generate(tokens: string[]): bigint;
13
+ /**
14
+ * Computes the Hamming distance between two 64-bit hashes.
15
+ */
16
+ static hammingDistance(a: bigint, b: bigint): number;
17
+ }
@@ -0,0 +1,56 @@
1
/**
 * 64-bit SimHash built on FNV-1a token hashes, plus a Hamming-distance helper
 * for comparing two fingerprints.
 */
export class SimHash {
    static FNV_PRIME = 1099511628211n;
    static FNV_OFFSET_BASIS = 14695981039346656037n;
    static MAX_UINT64 = 0xffffffffffffffffn;

    /**
     * Generates a 64-bit FNV-1a hash for a given string token.
     *
     * Each UTF-16 code unit of `token` is XOR-ed into the state and the state
     * is multiplied by the FNV prime, truncated to 64 bits.
     *
     * @param token - String to hash; the empty string yields the offset basis.
     * @returns The hash as a non-negative bigint below 2^64.
     */
    static fnv1a64(token) {
        let hash = this.FNV_OFFSET_BASIS;
        const len = token.length;
        for (let i = 0; i < len; i++) {
            hash ^= BigInt(token.charCodeAt(i));
            // BigInt has arbitrary precision, so wrap to 64 bits explicitly.
            hash = (hash * this.FNV_PRIME) & this.MAX_UINT64;
        }
        return hash;
    }

    /**
     * Generates a 64-bit SimHash from an array of tokens.
     *
     * Every token votes +1 / -1 on each of the 64 bit positions according to
     * its FNV-1a hash; a bit is set in the result when its tally is positive.
     *
     * @param tokens - Tokens to fingerprint; an empty array yields `0n`.
     * @returns The fingerprint as a non-negative bigint below 2^64.
     */
    static generate(tokens) {
        const tally = new Int32Array(64);
        for (const token of tokens) {
            // Consume the hash one low bit at a time; the counter stays a plain
            // number so no BigInt <-> Number conversion is needed per bit.
            let remaining = this.fnv1a64(token);
            for (let bit = 0; bit < 64; bit++) {
                if ((remaining & 1n) === 1n) {
                    tally[bit]++;
                }
                else {
                    tally[bit]--;
                }
                remaining >>= 1n;
            }
        }
        let simhash = 0n;
        for (let bit = 0; bit < 64; bit++) {
            if (tally[bit] > 0) {
                simhash |= (1n << BigInt(bit));
            }
        }
        return simhash;
    }

    /**
     * Computes the Hamming distance between two 64-bit hashes.
     *
     * Only the low 64 bits of `a ^ b` are counted, so a hash that arrives as
     * a negative bigint (the signed view of the same 64 bits) compares
     * correctly instead of short-circuiting to 0.
     *
     * @param a - First hash.
     * @param b - Second hash.
     * @returns Number of differing bit positions, 0 to 64.
     */
    static hammingDistance(a, b) {
        let xor = (a ^ b) & this.MAX_UINT64;
        let distance = 0;
        while (xor > 0n) {
            // Kernighan's bit counting: each step clears the lowest set bit.
            xor &= xor - 1n;
            distance++;
        }
        return distance;
    }
}
@@ -0,0 +1,30 @@
1
+ export * from './crawler/crawl.js';
2
+ export * from './crawler/metricsRunner.js';
3
+ export * from './graph/metrics.js';
4
+ export * from './report/html.js';
5
+ export * from './report/sitegraph_template.js';
6
+ export * from './report/sitegraphExport.js';
7
+ export * from './graph/graph.js';
8
+ export * from './diff/compare.js';
9
+ export * from './scoring/orphanSeverity.js';
10
+ export * from './graph/pagerank.js';
11
+ export * from './graph/duplicate.js';
12
+ export * from './graph/cluster.js';
13
+ export * from './scoring/hits.js';
14
+ export * from './analysis/analyze.js';
15
+ export * from './analysis/content.js';
16
+ export * from './analysis/seo.js';
17
+ export * from './analysis/images.js';
18
+ export * from './analysis/links.js';
19
+ export * from './audit/index.js';
20
+ export * from './audit/types.js';
21
+ export * from './db/index.js';
22
+ export * from './db/graphLoader.js';
23
+ export * from './db/repositories/SiteRepository.js';
24
+ export * from './db/repositories/SnapshotRepository.js';
25
+ export * from './db/repositories/PageRepository.js';
26
+ export * from './db/repositories/EdgeRepository.js';
27
+ export * from './db/repositories/MetricsRepository.js';
28
+ export * from './lock/lockManager.js';
29
+ export * from './lock/hashKey.js';
30
+ export * from './utils/version.js';
package/dist/index.js ADDED
@@ -0,0 +1,30 @@
1
+ export * from './crawler/crawl.js';
2
+ export * from './crawler/metricsRunner.js';
3
+ export * from './graph/metrics.js';
4
+ export * from './report/html.js';
5
+ export * from './report/sitegraph_template.js';
6
+ export * from './report/sitegraphExport.js';
7
+ export * from './graph/graph.js';
8
+ export * from './diff/compare.js';
9
+ export * from './scoring/orphanSeverity.js';
10
+ export * from './graph/pagerank.js';
11
+ export * from './graph/duplicate.js';
12
+ export * from './graph/cluster.js';
13
+ export * from './scoring/hits.js';
14
+ export * from './analysis/analyze.js';
15
+ export * from './analysis/content.js';
16
+ export * from './analysis/seo.js';
17
+ export * from './analysis/images.js';
18
+ export * from './analysis/links.js';
19
+ export * from './audit/index.js';
20
+ export * from './audit/types.js';
21
+ export * from './db/index.js';
22
+ export * from './db/graphLoader.js';
23
+ export * from './db/repositories/SiteRepository.js';
24
+ export * from './db/repositories/SnapshotRepository.js';
25
+ export * from './db/repositories/PageRepository.js';
26
+ export * from './db/repositories/EdgeRepository.js';
27
+ export * from './db/repositories/MetricsRepository.js';
28
+ export * from './lock/lockManager.js';
29
+ export * from './lock/hashKey.js';
30
+ export * from './utils/version.js';
@@ -0,0 +1 @@
1
+ export declare function generateLockKey(commandName: string, targetUrl: string, options: any): string;
@@ -0,0 +1,44 @@
1
+ import crypto from 'node:crypto';
2
+ import { normalizeUrl } from '../crawler/normalize.js';
3
// Flags that change the nature of the crawl and should be part of the lock key.
// The array order is also the order in which the flags are serialized.
const RELEVANT_FLAGS = [
    'limit',
    'depth',
    'output',
    'sitemap',
    'incremental',
    'detectSoft404',
    'detectTraps',
    'includeSubdomains',
    'allow',
    'deny',
    'proxy',
    'ua',
    'maxRedirects',
    'rate',
    'maxBytes',
    'concurrency'
];

/**
 * Builds the lock key for one command invocation.
 *
 * The key is the hex SHA-256 digest of a JSON document containing the
 * command name, the normalized target URL, and the subset of `options`
 * named in RELEVANT_FLAGS whose value is not `undefined`.
 *
 * @param commandName - Name of the command being run.
 * @param targetUrl - URL the command targets. It is passed through
 *   `normalizeUrl`; if that returns a falsy value the raw string is used.
 * @param options - Parsed command options. `null` / `undefined` is treated
 *   as "no options" rather than raising a TypeError.
 * @returns 64-character lowercase hex digest.
 */
export function generateLockKey(commandName, targetUrl, options) {
    const opts = options ?? {};
    // Respect the query stripping option consistent with sitegraph logic
    const stripQuery = !opts.query;
    const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;

    // Copy the relevant options in RELEVANT_FLAGS order so the serialized
    // form does not depend on the order the caller populated `options` in.
    const lockOptions = {};
    for (const key of RELEVANT_FLAGS) {
        if (opts[key] !== undefined) {
            lockOptions[key] = opts[key];
        }
    }

    const compositeKey = {
        command: commandName,
        target: normalizedTarget,
        options: lockOptions
    };

    // JSON.stringify emits string keys in insertion order, so the fixed
    // insertion order above gives a stable string to hash.
    const stableString = JSON.stringify(compositeKey);
    return crypto.createHash('sha256').update(stableString).digest('hex');
}
@@ -0,0 +1,7 @@
1
+ export declare class LockManager {
2
+ private static lockFilePath;
3
+ private static get lockDir();
4
+ static acquireLock(commandName: string, targetUrl: string, options: any, force?: boolean): Promise<void>;
5
+ static releaseLock(): void;
6
+ private static registerHandlers;
7
+ }
@@ -0,0 +1,112 @@
1
+ import fs from 'node:fs/promises';
2
+ import { existsSync, unlinkSync, readFileSync } from 'node:fs';
3
+ import path from 'node:path';
4
+ import os from 'node:os';
5
+ import chalk from 'chalk';
6
+ import { generateLockKey } from './hashKey.js';
7
+ import { isPidAlive } from './pidCheck.js';
8
/**
 * File-based lock that stops the same command/target/options combination
 * from running twice at once.
 *
 * Lock files live under `~/.crawlith/locks/<hash>.lock` and record the
 * owning PID. A lock whose PID is no longer alive, or whose file cannot be
 * parsed, is treated as stale and replaced.
 */
export class LockManager {
    /** Path of the lock file created by this process, or null when none is held. */
    static lockFilePath = null;
    /** True once the process-level cleanup listeners have been installed. */
    static handlersRegistered = false;

    /** Directory that holds all lock files. */
    static get lockDir() {
        return path.join(os.homedir(), '.crawlith', 'locks');
    }

    /**
     * Acquires the lock for the given invocation.
     *
     * When a live lock already exists and `force` is false, an error is
     * printed and `process.exit(1)` is called. With `force`, or when the
     * existing lock is stale, the old file is removed and a new one written.
     *
     * @param commandName - Name of the command being run.
     * @param targetUrl - URL the command targets (also used in messages).
     * @param options - Command options; hashed into the lock key and stored
     *   in the lock file under `args`.
     * @param force - Override an existing lock even if its owner is alive.
     * @throws Any error from writing the lock file other than `EEXIST`.
     */
    static async acquireLock(commandName, targetUrl, options, force = false) {
        const lockHash = generateLockKey(commandName, targetUrl, options);
        // Ensure lock directory exists
        await fs.mkdir(this.lockDir, { recursive: true });
        const lockPath = path.join(this.lockDir, `${lockHash}.lock`);

        // Check existing lock
        if (existsSync(lockPath)) {
            let isStale;
            let pid;
            try {
                const lockContent = readFileSync(lockPath, 'utf-8');
                const lockData = JSON.parse(lockContent);
                pid = lockData.pid;
                isStale = !isPidAlive(pid);
            }
            catch (_e) {
                // Corrupted -> Treat as stale
                isStale = true;
                pid = 0; // Fallback, though unused if isStale is true
            }
            if (force) {
                console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
                try {
                    unlinkSync(lockPath);
                }
                catch { /* best effort: the 'wx' write below reports a surviving file */ }
            }
            else {
                if (!isStale) {
                    console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
                    process.exit(1);
                }
                else {
                    console.log(chalk.gray('Detected stale lock. Continuing execution.'));
                    try {
                        unlinkSync(lockPath);
                    }
                    catch { /* best effort: the 'wx' write below reports a surviving file */ }
                }
            }
        }

        // Create new lock
        try {
            const data = {
                pid: process.pid,
                startedAt: Date.now(),
                command: commandName,
                target: targetUrl,
                args: options
            };
            // 'wx' flag ensures atomic creation, failing if file exists
            await fs.writeFile(lockPath, JSON.stringify(data, null, 2), { flag: 'wx', encoding: 'utf-8' });
            this.lockFilePath = lockPath;
            this.registerHandlers();
        }
        catch (error) {
            if (error.code === 'EEXIST') {
                // Race condition: another process created lock between our check and open
                console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
                process.exit(1);
            }
            throw error;
        }
    }

    /**
     * Removes the lock file held by this process, if there is one.
     * Synchronous so that it can run from an `exit` listener; errors are
     * ignored because there is nothing useful to do with them during shutdown.
     */
    static releaseLock() {
        if (this.lockFilePath && existsSync(this.lockFilePath)) {
            try {
                unlinkSync(this.lockFilePath);
                this.lockFilePath = null;
            }
            catch (_error) {
                // Ignore errors during cleanup
            }
        }
    }

    /**
     * Installs process listeners that release the lock on exit, SIGINT,
     * SIGTERM and uncaught exceptions.
     *
     * The listeners are installed at most once per process: they all call
     * `releaseLock()`, which reads the current `lockFilePath`, so repeated
     * `acquireLock` calls do not need (and must not pile up) extra listeners.
     */
    static registerHandlers() {
        if (this.handlersRegistered) {
            return;
        }
        this.handlersRegistered = true;

        const cleanup = () => {
            this.releaseLock();
        };
        // process.on('exit') is only called when process.exit() is called or event loop empties.
        // It requires synchronous cleanup.
        process.on('exit', cleanup);
        // Signals
        process.on('SIGINT', () => {
            cleanup();
            process.exit(130);
        });
        process.on('SIGTERM', () => {
            cleanup();
            process.exit(143);
        });
        process.on('uncaughtException', (err) => {
            console.error(chalk.red('Uncaught Exception:'), err);
            cleanup();
            process.exit(1);
        });
    }
}
@@ -0,0 +1 @@
1
+ export declare function isPidAlive(pid: number): boolean;
@@ -0,0 +1,14 @@
1
/**
 * Reports whether a process with the given PID currently exists.
 *
 * The check sends signal 0 via `process.kill`, which performs the existence
 * and permission checks without delivering a signal.
 *
 * @param pid - Process id to probe. Anything other than a positive integer
 *   returns `false`: 0 and negative values address process groups rather
 *   than a single process, so they cannot identify a lock owner.
 * @returns `true` if the process exists (including when it exists but may
 *   not be signalled, `EPERM`); `false` otherwise.
 */
export function isPidAlive(pid) {
    if (!Number.isInteger(pid) || pid <= 0) {
        return false;
    }
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (error) {
        // EPERM: process exists but we lack permission to signal it -> alive.
        // ESRCH or anything else: treat as not running.
        return error.code === 'EPERM';
    }
}
@@ -0,0 +1,2 @@
1
+ import { Metrics } from '../graph/metrics.js';
2
+ export declare function generateHtml(graphData: any, metrics: Metrics): string;