@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -35,6 +35,11 @@ export interface GraphNode {
35
35
  clusterId?: number;
36
36
  bytesReceived?: number;
37
37
  linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
38
+ crawlStatus?: string;
39
+ wordCount?: number;
40
+ thinContentScore?: number;
41
+ externalLinkRatio?: number;
42
+ orphanScore?: number;
38
43
  }
39
44
 
40
45
  export interface GraphEdge {
@@ -60,7 +65,7 @@ export interface CrawlStats {
60
65
 
61
66
  export class Graph {
62
67
  nodes: Map<string, GraphNode> = new Map();
63
- // Using string "source|target" to ensure uniqueness efficiently. Mapping to weight.
68
+ // Using JSON string of [source, target] to ensure uniqueness. Mapping to weight.
64
69
  edges: Map<string, number> = new Map();
65
70
  limitReached: boolean = false;
66
71
  sessionStats: CrawlStats = {
@@ -73,6 +78,21 @@ export class Graph {
73
78
  duplicateClusters: { id: string; type: 'exact' | 'near' | 'template_heavy'; size: number; representative: string; severity: 'low' | 'medium' | 'high' }[] = [];
74
79
  contentClusters: ClusterInfo[] = [];
75
80
 
81
+ /**
82
+ * Generates a unique key for an edge.
83
+ */
84
+ static getEdgeKey(source: string, target: string): string {
85
+ return JSON.stringify([source, target]);
86
+ }
87
+
88
+ /**
89
+ * Parses an edge key back into source and target.
90
+ */
91
+ static parseEdgeKey(key: string): { source: string; target: string } {
92
+ const [source, target] = JSON.parse(key);
93
+ return { source, target };
94
+ }
95
+
76
96
  /**
77
97
  * Adds a node to the graph if it doesn't exist.
78
98
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
@@ -113,7 +133,7 @@ export class Graph {
113
133
  const targetNode = this.nodes.get(target);
114
134
 
115
135
  if (sourceNode && targetNode) {
116
- const edgeKey = `${source}|${target}`;
136
+ const edgeKey = Graph.getEdgeKey(source, target);
117
137
  if (!this.edges.has(edgeKey)) {
118
138
  this.edges.set(edgeKey, weight);
119
139
  sourceNode.outLinks++;
@@ -134,7 +154,7 @@ export class Graph {
134
154
 
135
155
  getEdges(): GraphEdge[] {
136
156
  return Array.from(this.edges.entries()).map(([edge, weight]) => {
137
- const [source, target] = edge.split('|');
157
+ const { source, target } = Graph.parseEdgeKey(edge);
138
158
  return { source, target, weight };
139
159
  });
140
160
  }
@@ -157,7 +177,7 @@ export class Graph {
157
177
  }
158
178
  if (json.edges) {
159
179
  for (const edge of json.edges) {
160
- const key = `${edge.source}|${edge.target}`;
180
+ const key = Graph.getEdgeKey(edge.source, edge.target);
161
181
  graph.edges.set(key, edge.weight || 1.0);
162
182
  }
163
183
  }
@@ -29,6 +29,21 @@ export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
29
29
  const totalPages = nodes.length;
30
30
  const totalEdges = edges.length;
31
31
 
32
+ // Identify broken nodes
33
+ const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
34
+
35
+ // Populate brokenLinks per node
36
+ for (const node of nodes) {
37
+ const nodeEdges = edges.filter(e => e.source === node.url);
38
+ const broken = nodeEdges
39
+ .map(e => e.target)
40
+ .filter(targetUrl => brokenNodes.has(targetUrl));
41
+
42
+ if (broken.length > 0) {
43
+ node.brokenLinks = broken;
44
+ }
45
+ }
46
+
32
47
  // Authority Score (per node)
33
48
  const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
34
49
  const getAuthority = (node: GraphNode) => {
@@ -26,6 +26,7 @@ export function computePageRank(graph: Graph, options: PageRankOptions = {}) {
26
26
  if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
27
27
  if (node.canonical && node.canonical !== node.url) return false;
28
28
  if (node.status >= 400) return false; // Don't pass rank to broken pages
29
+ if (node.status === 0) return false; // Don't pass rank to uncrawled/external pages
29
30
  return true;
30
31
  });
31
32
 
@@ -2,6 +2,8 @@ export class SimHash {
2
2
  private static FNV_PRIME = 1099511628211n;
3
3
  private static FNV_OFFSET_BASIS = 14695981039346656037n;
4
4
  private static MAX_UINT64 = 0xffffffffffffffffn;
5
+ public static readonly BANDS = 4;
6
+ public static readonly BAND_WIDTH = 16;
5
7
 
6
8
  /**
7
9
  * Generates a 64-bit FNV-1a hash for a given string token.
@@ -45,6 +47,19 @@ export class SimHash {
45
47
  return simhash;
46
48
  }
47
49
 
50
+ /**
51
+ * Splits a 64-bit SimHash into 4 bands of 16 bits.
52
+ */
53
+ static getBands(simhash: bigint): number[] {
54
+ const bands: number[] = [];
55
+ for (let i = 0; i < SimHash.BANDS; i++) {
56
+ // Extract 16-bit chunks
57
+ const chunk = Number((simhash >> BigInt(i * SimHash.BAND_WIDTH)) & 0xFFFFn);
58
+ bands.push(chunk);
59
+ }
60
+ return bands;
61
+ }
62
+
48
63
  /**
49
64
  * Computes the Hamming distance between two 64-bit hashes.
50
65
  */
package/src/index.ts CHANGED
@@ -1,15 +1,17 @@
1
1
  export * from './crawler/crawl.js';
2
+ export * from './crawler/normalize.js';
2
3
  export * from './crawler/metricsRunner.js';
3
4
  export * from './graph/metrics.js';
4
5
  export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
6
+ export * from './report/crawl_template.js';
7
+ export * from './report/crawlExport.js';
7
8
  export * from './graph/graph.js';
8
9
  export * from './diff/compare.js';
9
10
  export * from './scoring/orphanSeverity.js';
10
11
  export * from './graph/pagerank.js';
11
12
  export * from './graph/duplicate.js';
12
13
  export * from './graph/cluster.js';
14
+ export * from './scoring/health.js';
13
15
  export * from './scoring/hits.js';
14
16
  export * from './analysis/analyze.js';
15
17
  export * from './analysis/content.js';
@@ -28,3 +30,4 @@ export * from './db/repositories/MetricsRepository.js';
28
30
  export * from './lock/lockManager.js';
29
31
  export * from './lock/hashKey.js';
30
32
  export * from './utils/version.js';
33
+ export * from './events.js';
@@ -22,7 +22,7 @@ const RELEVANT_FLAGS = [
22
22
  ];
23
23
 
24
24
  export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
25
- // Respect the query stripping option consistent with sitegraph logic
25
+ // Respect the query stripping option consistent with crawl logic
26
26
  const stripQuery = !options.query;
27
27
 
28
28
  const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
@@ -2,9 +2,9 @@ import fs from 'node:fs/promises';
2
2
  import { existsSync, unlinkSync, readFileSync } from 'node:fs';
3
3
  import path from 'node:path';
4
4
  import os from 'node:os';
5
- import chalk from 'chalk';
6
5
  import { generateLockKey } from './hashKey.js';
7
6
  import { isPidAlive } from './pidCheck.js';
7
+ import { EngineContext } from '../events.js';
8
8
 
9
9
  interface LockData {
10
10
  pid: number;
@@ -16,16 +16,17 @@ interface LockData {
16
16
 
17
17
  export class LockManager {
18
18
  private static lockFilePath: string | null = null;
19
+ private static context: EngineContext | null = null;
19
20
 
20
21
  private static get lockDir(): string {
21
22
  return path.join(os.homedir(), '.crawlith', 'locks');
22
23
  }
23
24
 
24
- static async acquireLock(commandName: string, targetUrl: string, options: any, force: boolean = false): Promise<void> {
25
+ static async acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force: boolean = false): Promise<void> {
26
+ this.context = context || null;
25
27
  const lockHash = generateLockKey(commandName, targetUrl, options);
26
28
 
27
29
  // Ensure lock directory exists
28
- // We can use sync or async here. Since this is one-time setup, async is fine.
29
30
  await fs.mkdir(this.lockDir, { recursive: true });
30
31
 
31
32
  const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
@@ -43,18 +44,18 @@ export class LockManager {
43
44
  } catch (_e) {
44
45
  // Corrupted -> Treat as stale
45
46
  isStale = true;
46
- pid = 0; // Fallback, though unused if isStale is true
47
+ pid = 0;
47
48
  }
48
49
 
49
50
  if (force) {
50
- console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
51
+ this.log('warn', 'Force mode enabled. Overriding existing lock.');
51
52
  try { unlinkSync(lockPath); } catch { /* ignore */ }
52
53
  } else {
53
54
  if (!isStale) {
54
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
55
+ this.log('error', `Crawlith: command already running for ${targetUrl} (PID ${pid})`);
55
56
  process.exit(1);
56
57
  } else {
57
- console.log(chalk.gray('Detected stale lock. Continuing execution.'));
58
+ this.log('info', 'Detected stale lock. Continuing execution.');
58
59
  try { unlinkSync(lockPath); } catch { /* ignore */ }
59
60
  }
60
61
  }
@@ -77,8 +78,7 @@ export class LockManager {
77
78
  this.registerHandlers();
78
79
  } catch (error: any) {
79
80
  if (error.code === 'EEXIST') {
80
- // Race condition: another process created lock between our check and open
81
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
81
+ this.log('error', `Crawlith: command already running for ${targetUrl} (Race condition)`);
82
82
  process.exit(1);
83
83
  }
84
84
  throw error;
@@ -96,17 +96,25 @@ export class LockManager {
96
96
  }
97
97
  }
98
98
 
99
+ private static log(type: 'info' | 'warn' | 'error', message: string, error?: unknown) {
100
+ if (this.context) {
101
+ this.context.emit({ type, message, error });
102
+ } else {
103
+ // Fallback for legacy usage or when no context provided
104
+ if (type === 'error') console.error(message, error || '');
105
+ else if (type === 'warn') console.warn(message);
106
+ else console.log(message);
107
+ }
108
+ }
109
+
99
110
  private static registerHandlers() {
100
111
  // Ensure cleanup only happens once
101
112
  const cleanup = () => {
102
113
  this.releaseLock();
103
114
  };
104
115
 
105
- // process.on('exit') is only called when process.exit() is called or event loop empties.
106
- // It requires synchronous cleanup.
107
116
  process.on('exit', cleanup);
108
117
 
109
- // Signals
110
118
  process.on('SIGINT', () => {
111
119
  cleanup();
112
120
  process.exit(130);
@@ -116,7 +124,7 @@ export class LockManager {
116
124
  process.exit(143);
117
125
  });
118
126
  process.on('uncaughtException', (err) => {
119
- console.error(chalk.red('Uncaught Exception:'), err);
127
+ this.log('error', 'Uncaught Exception', err);
120
128
  cleanup();
121
129
  process.exit(1);
122
130
  });