@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/graph/graph.ts
CHANGED
|
@@ -35,6 +35,11 @@ export interface GraphNode {
|
|
|
35
35
|
clusterId?: number;
|
|
36
36
|
bytesReceived?: number;
|
|
37
37
|
linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
|
|
38
|
+
crawlStatus?: string;
|
|
39
|
+
wordCount?: number;
|
|
40
|
+
thinContentScore?: number;
|
|
41
|
+
externalLinkRatio?: number;
|
|
42
|
+
orphanScore?: number;
|
|
38
43
|
}
|
|
39
44
|
|
|
40
45
|
export interface GraphEdge {
|
|
@@ -60,7 +65,7 @@ export interface CrawlStats {
|
|
|
60
65
|
|
|
61
66
|
export class Graph {
|
|
62
67
|
nodes: Map<string, GraphNode> = new Map();
|
|
63
|
-
// Using string
|
|
68
|
+
// Keyed by the JSON string of [source, target] (see getEdgeKey) so each edge key is unambiguous; the value is the edge weight.
|
|
64
69
|
edges: Map<string, number> = new Map();
|
|
65
70
|
limitReached: boolean = false;
|
|
66
71
|
sessionStats: CrawlStats = {
|
|
@@ -73,6 +78,21 @@ export class Graph {
|
|
|
73
78
|
duplicateClusters: { id: string; type: 'exact' | 'near' | 'template_heavy'; size: number; representative: string; severity: 'low' | 'medium' | 'high' }[] = [];
|
|
74
79
|
contentClusters: ClusterInfo[] = [];
|
|
75
80
|
|
|
81
|
+
/**
|
|
82
|
+
* Generates the unique map key for an edge: the JSON-encoded [source, target] pair (order matters).
|
|
83
|
+
*/
|
|
84
|
+
static getEdgeKey(source: string, target: string): string {
|
|
85
|
+
return JSON.stringify([source, target]);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Parses an edge key produced by getEdgeKey back into its source and target (the parsed JSON is not validated).
|
|
90
|
+
*/
|
|
91
|
+
static parseEdgeKey(key: string): { source: string; target: string } {
|
|
92
|
+
const [source, target] = JSON.parse(key);
|
|
93
|
+
return { source, target };
|
|
94
|
+
}
|
|
95
|
+
|
|
76
96
|
/**
|
|
77
97
|
* Adds a node to the graph if it doesn't exist.
|
|
78
98
|
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
@@ -113,7 +133,7 @@ export class Graph {
|
|
|
113
133
|
const targetNode = this.nodes.get(target);
|
|
114
134
|
|
|
115
135
|
if (sourceNode && targetNode) {
|
|
116
|
-
const edgeKey =
|
|
136
|
+
const edgeKey = Graph.getEdgeKey(source, target);
|
|
117
137
|
if (!this.edges.has(edgeKey)) {
|
|
118
138
|
this.edges.set(edgeKey, weight);
|
|
119
139
|
sourceNode.outLinks++;
|
|
@@ -134,7 +154,7 @@ export class Graph {
|
|
|
134
154
|
|
|
135
155
|
getEdges(): GraphEdge[] {
|
|
136
156
|
return Array.from(this.edges.entries()).map(([edge, weight]) => {
|
|
137
|
-
const
|
|
157
|
+
const { source, target } = Graph.parseEdgeKey(edge);
|
|
138
158
|
return { source, target, weight };
|
|
139
159
|
});
|
|
140
160
|
}
|
|
@@ -157,7 +177,7 @@ export class Graph {
|
|
|
157
177
|
}
|
|
158
178
|
if (json.edges) {
|
|
159
179
|
for (const edge of json.edges) {
|
|
160
|
-
const key =
|
|
180
|
+
const key = Graph.getEdgeKey(edge.source, edge.target);
|
|
161
181
|
graph.edges.set(key, edge.weight || 1.0);
|
|
162
182
|
}
|
|
163
183
|
}
|
package/src/graph/metrics.ts
CHANGED
|
@@ -29,6 +29,21 @@ export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
|
|
|
29
29
|
const totalPages = nodes.length;
|
|
30
30
|
const totalEdges = edges.length;
|
|
31
31
|
|
|
32
|
+
// Identify broken nodes: status >= 400, or status 0 (the value pagerank.ts treats as uncrawled/external)
|
|
33
|
+
const brokenNodes = new Set(nodes.filter(n => n.status >= 400 || n.status === 0).map(n => n.url));
|
|
34
|
+
|
|
35
|
+
// Populate brokenLinks per node
|
|
36
|
+
for (const node of nodes) {
|
|
37
|
+
const nodeEdges = edges.filter(e => e.source === node.url);
|
|
38
|
+
const broken = nodeEdges
|
|
39
|
+
.map(e => e.target)
|
|
40
|
+
.filter(targetUrl => brokenNodes.has(targetUrl));
|
|
41
|
+
|
|
42
|
+
if (broken.length > 0) {
|
|
43
|
+
node.brokenLinks = broken;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
32
47
|
// Authority Score (per node)
|
|
33
48
|
const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
|
|
34
49
|
const getAuthority = (node: GraphNode) => {
|
package/src/graph/pagerank.ts
CHANGED
|
@@ -26,6 +26,7 @@ export function computePageRank(graph: Graph, options: PageRankOptions = {}) {
|
|
|
26
26
|
if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
|
|
27
27
|
if (node.canonical && node.canonical !== node.url) return false;
|
|
28
28
|
if (node.status >= 400) return false; // Don't pass rank to broken pages
|
|
29
|
+
if (node.status === 0) return false; // Don't pass rank to uncrawled/external pages
|
|
29
30
|
return true;
|
|
30
31
|
});
|
|
31
32
|
|
package/src/graph/simhash.ts
CHANGED
|
@@ -2,6 +2,8 @@ export class SimHash {
|
|
|
2
2
|
private static FNV_PRIME = 1099511628211n;
|
|
3
3
|
private static FNV_OFFSET_BASIS = 14695981039346656037n;
|
|
4
4
|
private static MAX_UINT64 = 0xffffffffffffffffn;
|
|
5
|
+
public static readonly BANDS = 4;
|
|
6
|
+
public static readonly BAND_WIDTH = 16;
|
|
5
7
|
|
|
6
8
|
/**
|
|
7
9
|
* Generates a 64-bit FNV-1a hash for a given string token.
|
|
@@ -45,6 +47,19 @@ export class SimHash {
|
|
|
45
47
|
return simhash;
|
|
46
48
|
}
|
|
47
49
|
|
|
50
|
+
/**
|
|
51
|
+
* Splits a 64-bit SimHash into 4 bands of 16 bits, returned least-significant band first.
|
|
52
|
+
*/
|
|
53
|
+
static getBands(simhash: bigint): number[] {
|
|
54
|
+
const bands: number[] = [];
|
|
55
|
+
for (let i = 0; i < SimHash.BANDS; i++) {
|
|
56
|
+
// Extract 16-bit chunks
|
|
57
|
+
const chunk = Number((simhash >> BigInt(i * SimHash.BAND_WIDTH)) & 0xFFFFn);
|
|
58
|
+
bands.push(chunk);
|
|
59
|
+
}
|
|
60
|
+
return bands;
|
|
61
|
+
}
|
|
62
|
+
|
|
48
63
|
/**
|
|
49
64
|
* Computes the Hamming distance between two 64-bit hashes.
|
|
50
65
|
*/
|
package/src/index.ts
CHANGED
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
export * from './crawler/crawl.js';
|
|
2
|
+
export * from './crawler/normalize.js';
|
|
2
3
|
export * from './crawler/metricsRunner.js';
|
|
3
4
|
export * from './graph/metrics.js';
|
|
4
5
|
export * from './report/html.js';
|
|
5
|
-
export * from './report/
|
|
6
|
-
export * from './report/
|
|
6
|
+
export * from './report/crawl_template.js';
|
|
7
|
+
export * from './report/crawlExport.js';
|
|
7
8
|
export * from './graph/graph.js';
|
|
8
9
|
export * from './diff/compare.js';
|
|
9
10
|
export * from './scoring/orphanSeverity.js';
|
|
10
11
|
export * from './graph/pagerank.js';
|
|
11
12
|
export * from './graph/duplicate.js';
|
|
12
13
|
export * from './graph/cluster.js';
|
|
14
|
+
export * from './scoring/health.js';
|
|
13
15
|
export * from './scoring/hits.js';
|
|
14
16
|
export * from './analysis/analyze.js';
|
|
15
17
|
export * from './analysis/content.js';
|
|
@@ -28,3 +30,4 @@ export * from './db/repositories/MetricsRepository.js';
|
|
|
28
30
|
export * from './lock/lockManager.js';
|
|
29
31
|
export * from './lock/hashKey.js';
|
|
30
32
|
export * from './utils/version.js';
|
|
33
|
+
export * from './events.js';
|
package/src/lock/hashKey.ts
CHANGED
|
@@ -22,7 +22,7 @@ const RELEVANT_FLAGS = [
|
|
|
22
22
|
];
|
|
23
23
|
|
|
24
24
|
export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
|
|
25
|
-
// Respect the query stripping option consistent with
|
|
25
|
+
// Respect the query-stripping option, consistent with the crawl logic
|
|
26
26
|
const stripQuery = !options.query;
|
|
27
27
|
|
|
28
28
|
const normalizedTarget = normalizeUrl(targetUrl, '', { stripQuery }) || targetUrl;
|
package/src/lock/lockManager.ts
CHANGED
|
@@ -2,9 +2,9 @@ import fs from 'node:fs/promises';
|
|
|
2
2
|
import { existsSync, unlinkSync, readFileSync } from 'node:fs';
|
|
3
3
|
import path from 'node:path';
|
|
4
4
|
import os from 'node:os';
|
|
5
|
-
import chalk from 'chalk';
|
|
6
5
|
import { generateLockKey } from './hashKey.js';
|
|
7
6
|
import { isPidAlive } from './pidCheck.js';
|
|
7
|
+
import { EngineContext } from '../events.js';
|
|
8
8
|
|
|
9
9
|
interface LockData {
|
|
10
10
|
pid: number;
|
|
@@ -16,16 +16,17 @@ interface LockData {
|
|
|
16
16
|
|
|
17
17
|
export class LockManager {
|
|
18
18
|
private static lockFilePath: string | null = null;
|
|
19
|
+
private static context: EngineContext | null = null;
|
|
19
20
|
|
|
20
21
|
private static get lockDir(): string {
|
|
21
22
|
return path.join(os.homedir(), '.crawlith', 'locks');
|
|
22
23
|
}
|
|
23
24
|
|
|
24
|
-
static async acquireLock(commandName: string, targetUrl: string, options: any, force: boolean = false): Promise<void> {
|
|
25
|
+
static async acquireLock(commandName: string, targetUrl: string, options: any, context?: EngineContext, force: boolean = false): Promise<void> {
|
|
26
|
+
this.context = context || null;
|
|
25
27
|
const lockHash = generateLockKey(commandName, targetUrl, options);
|
|
26
28
|
|
|
27
29
|
// Ensure lock directory exists
|
|
28
|
-
// We can use sync or async here. Since this is one-time setup, async is fine.
|
|
29
30
|
await fs.mkdir(this.lockDir, { recursive: true });
|
|
30
31
|
|
|
31
32
|
const lockPath = path.join(this.lockDir, `${lockHash}.lock`);
|
|
@@ -43,18 +44,18 @@ export class LockManager {
|
|
|
43
44
|
} catch (_e) {
|
|
44
45
|
// Corrupted -> Treat as stale
|
|
45
46
|
isStale = true;
|
|
46
|
-
pid = 0;
|
|
47
|
+
pid = 0;
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
if (force) {
|
|
50
|
-
|
|
51
|
+
this.log('warn', 'Force mode enabled. Overriding existing lock.');
|
|
51
52
|
try { unlinkSync(lockPath); } catch { /* ignore */ }
|
|
52
53
|
} else {
|
|
53
54
|
if (!isStale) {
|
|
54
|
-
|
|
55
|
+
this.log('error', `Crawlith: command already running for ${targetUrl} (PID ${pid})`);
|
|
55
56
|
process.exit(1);
|
|
56
57
|
} else {
|
|
57
|
-
|
|
58
|
+
this.log('info', 'Detected stale lock. Continuing execution.');
|
|
58
59
|
try { unlinkSync(lockPath); } catch { /* ignore */ }
|
|
59
60
|
}
|
|
60
61
|
}
|
|
@@ -77,8 +78,7 @@ export class LockManager {
|
|
|
77
78
|
this.registerHandlers();
|
|
78
79
|
} catch (error: any) {
|
|
79
80
|
if (error.code === 'EEXIST') {
|
|
80
|
-
|
|
81
|
-
console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
|
|
81
|
+
this.log('error', `Crawlith: command already running for ${targetUrl} (Race condition)`);
|
|
82
82
|
process.exit(1);
|
|
83
83
|
}
|
|
84
84
|
throw error;
|
|
@@ -96,17 +96,25 @@ export class LockManager {
|
|
|
96
96
|
}
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
+
private static log(type: 'info' | 'warn' | 'error', message: string, error?: unknown) {
|
|
100
|
+
if (this.context) {
|
|
101
|
+
this.context.emit({ type, message, error });
|
|
102
|
+
} else {
|
|
103
|
+
// Fall back to the console for legacy usage or when no context is provided
|
|
104
|
+
if (type === 'error') console.error(message, error || '');
|
|
105
|
+
else if (type === 'warn') console.warn(message);
|
|
106
|
+
else console.log(message);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
99
110
|
private static registerHandlers() {
|
|
100
111
|
// Ensure cleanup only happens once
|
|
101
112
|
const cleanup = () => {
|
|
102
113
|
this.releaseLock();
|
|
103
114
|
};
|
|
104
115
|
|
|
105
|
-
// process.on('exit') is only called when process.exit() is called or event loop empties.
|
|
106
|
-
// It requires synchronous cleanup.
|
|
107
116
|
process.on('exit', cleanup);
|
|
108
117
|
|
|
109
|
-
// Signals
|
|
110
118
|
process.on('SIGINT', () => {
|
|
111
119
|
cleanup();
|
|
112
120
|
process.exit(130);
|
|
@@ -116,7 +124,7 @@ export class LockManager {
|
|
|
116
124
|
process.exit(143);
|
|
117
125
|
});
|
|
118
126
|
process.on('uncaughtException', (err) => {
|
|
119
|
-
|
|
127
|
+
this.log('error', 'Uncaught Exception', err);
|
|
120
128
|
cleanup();
|
|
121
129
|
process.exit(1);
|
|
122
130
|
});
|