@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -35,6 +35,11 @@ export interface GraphNode {
35
35
  clusterId?: number;
36
36
  bytesReceived?: number;
37
37
  linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
38
+ crawlStatus?: string;
39
+ wordCount?: number;
40
+ thinContentScore?: number;
41
+ externalLinkRatio?: number;
42
+ orphanScore?: number;
38
43
  }
39
44
  export interface GraphEdge {
40
45
  source: string;
@@ -72,6 +77,17 @@ export declare class Graph {
72
77
  severity: 'low' | 'medium' | 'high';
73
78
  }[];
74
79
  contentClusters: ClusterInfo[];
80
+ /**
81
+ * Generates a unique key for an edge.
82
+ */
83
+ static getEdgeKey(source: string, target: string): string;
84
+ /**
85
+ * Parses an edge key back into source and target.
86
+ */
87
+ static parseEdgeKey(key: string): {
88
+ source: string;
89
+ target: string;
90
+ };
75
91
  /**
76
92
  * Adds a node to the graph if it doesn't exist.
77
93
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
@@ -1,6 +1,6 @@
1
1
  export class Graph {
2
2
  nodes = new Map();
3
- // Using string "source|target" to ensure uniqueness efficiently. Mapping to weight.
3
+ // Using JSON string of [source, target] to ensure uniqueness. Mapping to weight.
4
4
  edges = new Map();
5
5
  limitReached = false;
6
6
  sessionStats = {
@@ -12,6 +12,19 @@ export class Graph {
12
12
  trapClusters = [];
13
13
  duplicateClusters = [];
14
14
  contentClusters = [];
15
+ /**
16
+ * Generates a unique key for an edge.
17
+ */
18
+ static getEdgeKey(source, target) {
19
+ return JSON.stringify([source, target]);
20
+ }
21
+ /**
22
+ * Parses an edge key back into source and target.
23
+ */
24
+ static parseEdgeKey(key) {
25
+ const [source, target] = JSON.parse(key);
26
+ return { source, target };
27
+ }
15
28
  /**
16
29
  * Adds a node to the graph if it doesn't exist.
17
30
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
@@ -50,7 +63,7 @@ export class Graph {
50
63
  const sourceNode = this.nodes.get(source);
51
64
  const targetNode = this.nodes.get(target);
52
65
  if (sourceNode && targetNode) {
53
- const edgeKey = `${source}|${target}`;
66
+ const edgeKey = Graph.getEdgeKey(source, target);
54
67
  if (!this.edges.has(edgeKey)) {
55
68
  this.edges.set(edgeKey, weight);
56
69
  sourceNode.outLinks++;
@@ -70,7 +83,7 @@ export class Graph {
70
83
  }
71
84
  getEdges() {
72
85
  return Array.from(this.edges.entries()).map(([edge, weight]) => {
73
- const [source, target] = edge.split('|');
86
+ const { source, target } = Graph.parseEdgeKey(edge);
74
87
  return { source, target, weight };
75
88
  });
76
89
  }
@@ -91,7 +104,7 @@ export class Graph {
91
104
  }
92
105
  if (json.edges) {
93
106
  for (const edge of json.edges) {
94
- const key = `${edge.source}|${edge.target}`;
107
+ const key = Graph.getEdgeKey(edge.source, edge.target);
95
108
  graph.edges.set(key, edge.weight || 1.0);
96
109
  }
97
110
  }
@@ -3,6 +3,18 @@ export function calculateMetrics(graph, _maxDepth) {
3
3
  const edges = graph.getEdges();
4
4
  const totalPages = nodes.length;
5
5
  const totalEdges = edges.length;
6
+ // Identify broken nodes
7
+ const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
8
+ // Populate brokenLinks per node
9
+ for (const node of nodes) {
10
+ const nodeEdges = edges.filter(e => e.source === node.url);
11
+ const broken = nodeEdges
12
+ .map(e => e.target)
13
+ .filter(targetUrl => brokenNodes.has(targetUrl));
14
+ if (broken.length > 0) {
15
+ node.brokenLinks = broken;
16
+ }
17
+ }
6
18
  // Authority Score (per node)
7
19
  const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
8
20
  const getAuthority = (node) => {
@@ -20,6 +20,8 @@ export function computePageRank(graph, options = {}) {
20
20
  return false;
21
21
  if (node.status >= 400)
22
22
  return false; // Don't pass rank to broken pages
23
+ if (node.status === 0)
24
+ return false; // Don't pass rank to uncrawled/external pages
23
25
  return true;
24
26
  });
25
27
  const nodeCount = eligibleNodes.length;
@@ -2,6 +2,8 @@ export declare class SimHash {
2
2
  private static FNV_PRIME;
3
3
  private static FNV_OFFSET_BASIS;
4
4
  private static MAX_UINT64;
5
+ static readonly BANDS = 4;
6
+ static readonly BAND_WIDTH = 16;
5
7
  /**
6
8
  * Generates a 64-bit FNV-1a hash for a given string token.
7
9
  */
@@ -10,6 +12,10 @@ export declare class SimHash {
10
12
  * Generates a 64-bit SimHash from an array of tokens.
11
13
  */
12
14
  static generate(tokens: string[]): bigint;
15
+ /**
16
+ * Splits a 64-bit SimHash into 4 bands of 16 bits.
17
+ */
18
+ static getBands(simhash: bigint): number[];
13
19
  /**
14
20
  * Computes the Hamming distance between two 64-bit hashes.
15
21
  */
@@ -2,6 +2,8 @@ export class SimHash {
2
2
  static FNV_PRIME = 1099511628211n;
3
3
  static FNV_OFFSET_BASIS = 14695981039346656037n;
4
4
  static MAX_UINT64 = 0xffffffffffffffffn;
5
+ static BANDS = 4;
6
+ static BAND_WIDTH = 16;
5
7
  /**
6
8
  * Generates a 64-bit FNV-1a hash for a given string token.
7
9
  */
@@ -40,6 +42,18 @@ export class SimHash {
40
42
  }
41
43
  return simhash;
42
44
  }
45
+ /**
46
+ * Splits a 64-bit SimHash into 4 bands of 16 bits.
47
+ */
48
+ static getBands(simhash) {
49
+ const bands = [];
50
+ for (let i = 0; i < SimHash.BANDS; i++) {
51
+ // Extract 16-bit chunks
52
+ const chunk = Number((simhash >> BigInt(i * SimHash.BAND_WIDTH)) & 0xffffn);
53
+ bands.push(chunk);
54
+ }
55
+ return bands;
56
+ }
43
57
  /**
44
58
  * Computes the Hamming distance between two 64-bit hashes.
45
59
  */
package/dist/index.d.ts CHANGED
@@ -1,15 +1,17 @@
1
1
  export * from './crawler/crawl.js';
2
+ export * from './crawler/normalize.js';
2
3
  export * from './crawler/metricsRunner.js';
3
4
  export * from './graph/metrics.js';
4
5
  export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
6
+ export * from './report/crawl_template.js';
7
+ export * from './report/crawlExport.js';
7
8
  export * from './graph/graph.js';
8
9
  export * from './diff/compare.js';
9
10
  export * from './scoring/orphanSeverity.js';
10
11
  export * from './graph/pagerank.js';
11
12
  export * from './graph/duplicate.js';
12
13
  export * from './graph/cluster.js';
14
+ export * from './scoring/health.js';
13
15
  export * from './scoring/hits.js';
14
16
  export * from './analysis/analyze.js';
15
17
  export * from './analysis/content.js';
@@ -28,3 +30,4 @@ export * from './db/repositories/MetricsRepository.js';
28
30
  export * from './lock/lockManager.js';
29
31
  export * from './lock/hashKey.js';
30
32
  export * from './utils/version.js';
33
+ export * from './events.js';
package/dist/index.js CHANGED
@@ -1,15 +1,17 @@
1
1
  export * from './crawler/crawl.js';
2
+ export * from './crawler/normalize.js';
2
3
  export * from './crawler/metricsRunner.js';
3
4
  export * from './graph/metrics.js';
4
5
  export * from './report/html.js';
5
- export * from './report/sitegraph_template.js';
6
- export * from './report/sitegraphExport.js';
6
+ export * from './report/crawl_template.js';
7
+ export * from './report/crawlExport.js';
7
8
  export * from './graph/graph.js';
8
9
  export * from './diff/compare.js';
9
10
  export * from './scoring/orphanSeverity.js';
10
11
  export * from './graph/pagerank.js';
11
12
  export * from './graph/duplicate.js';
12
13
  export * from './graph/cluster.js';
14
+ export * from './scoring/health.js';
13
15
  export * from './scoring/hits.js';
14
16
  export * from './analysis/analyze.js';
15
17
  export * from './analysis/content.js';
@@ -28,3 +30,4 @@ export * from './db/repositories/MetricsRepository.js';
28
30
  export * from './lock/lockManager.js';
29
31
  export * from './lock/hashKey.js';
30
32
  export * from './utils/version.js';
33
+ export * from './events.js';
@@ -20,7 +20,7 @@ const RELEVANT_FLAGS = [
20
20
  'concurrency'
21
21
  ];
22
22
  export function generateLockKey(commandName, targetUrl, options) {
23
- // Respect the query stripping option consistent with sitegraph logic
23
+ // Respect the query stripping option consistent with crawl logic
24
24
  const stripQuery = !options.query;
25
25
  const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
26
26
  // Extract relevant options in a deterministic order
@@ -1,7 +1,10 @@
1
+ import { EngineContext } from '../events.js';
1
2
  export declare class LockManager {
2
3
  private static lockFilePath;
4
+ private static context;
3
5
  private static get lockDir();
4
- static acquireLock(commandName: string, targetUrl: string, options: any, force?: boolean): Promise<void>;
6
+ static acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force?: boolean): Promise<void>;
5
7
  static releaseLock(): void;
8
+ private static log;
6
9
  private static registerHandlers;
7
10
  }
@@ -2,18 +2,18 @@ import fs from 'node:fs/promises';
2
2
  import { existsSync, unlinkSync, readFileSync } from 'node:fs';
3
3
  import path from 'node:path';
4
4
  import os from 'node:os';
5
- import chalk from 'chalk';
6
5
  import { generateLockKey } from './hashKey.js';
7
6
  import { isPidAlive } from './pidCheck.js';
8
7
  export class LockManager {
9
8
  static lockFilePath = null;
9
+ static context = null;
10
10
  static get lockDir() {
11
11
  return path.join(os.homedir(), '.crawlith', 'locks');
12
12
  }
13
- static async acquireLock(commandName, targetUrl, options, force = false) {
13
+ static async acquireLock(commandName, targetUrl, options, context, force = false) {
14
+ this.context = context || null;
14
15
  const lockHash = generateLockKey(commandName, targetUrl, options);
15
16
  // Ensure lock directory exists
16
- // We can use sync or async here. Since this is one-time setup, async is fine.
17
17
  await fs.mkdir(this.lockDir, { recursive: true });
18
18
  const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
19
19
  // Check existing lock
@@ -29,10 +29,10 @@ export class LockManager {
29
29
  catch (_e) {
30
30
  // Corrupted -> Treat as stale
31
31
  isStale = true;
32
- pid = 0; // Fallback, though unused if isStale is true
32
+ pid = 0;
33
33
  }
34
34
  if (force) {
35
- console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
35
+ this.log('warn', 'Force mode enabled. Overriding existing lock.');
36
36
  try {
37
37
  unlinkSync(lockPath);
38
38
  }
@@ -40,11 +40,11 @@ export class LockManager {
40
40
  }
41
41
  else {
42
42
  if (!isStale) {
43
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
43
+ this.log('error', `Crawlith: command already running for ${targetUrl} (PID ${pid})`);
44
44
  process.exit(1);
45
45
  }
46
46
  else {
47
- console.log(chalk.gray('Detected stale lock. Continuing execution.'));
47
+ this.log('info', 'Detected stale lock. Continuing execution.');
48
48
  try {
49
49
  unlinkSync(lockPath);
50
50
  }
@@ -68,8 +68,7 @@ export class LockManager {
68
68
  }
69
69
  catch (error) {
70
70
  if (error.code === 'EEXIST') {
71
- // Race condition: another process created lock between our check and open
72
- console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
71
+ this.log('error', `Crawlith: command already running for ${targetUrl} (Race condition)`);
73
72
  process.exit(1);
74
73
  }
75
74
  throw error;
@@ -86,15 +85,26 @@ export class LockManager {
86
85
  }
87
86
  }
88
87
  }
88
+ static log(type, message, error) {
89
+ if (this.context) {
90
+ this.context.emit({ type, message, error });
91
+ }
92
+ else {
93
+ // Fallback for legacy usage or when no context provided
94
+ if (type === 'error')
95
+ console.error(message, error || '');
96
+ else if (type === 'warn')
97
+ console.warn(message);
98
+ else
99
+ console.log(message);
100
+ }
101
+ }
89
102
  static registerHandlers() {
90
103
  // Ensure cleanup only happens once
91
104
  const cleanup = () => {
92
105
  this.releaseLock();
93
106
  };
94
- // process.on('exit') is only called when process.exit() is called or event loop empties.
95
- // It requires synchronous cleanup.
96
107
  process.on('exit', cleanup);
97
- // Signals
98
108
  process.on('SIGINT', () => {
99
109
  cleanup();
100
110
  process.exit(130);
@@ -104,7 +114,7 @@ export class LockManager {
104
114
  process.exit(143);
105
115
  });
106
116
  process.on('uncaughtException', (err) => {
107
- console.error(chalk.red('Uncaught Exception:'), err);
117
+ this.log('error', 'Uncaught Exception', err);
108
118
  cleanup();
109
119
  process.exit(1);
110
120
  });