@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
 * 64-bit SimHash fingerprinting built on top of a 64-bit FNV-1a token hash.
 *
 * All hashes are represented as `bigint` values. `fnv1a64` and `generate`
 * always return values in the unsigned range [0, 2^64 - 1].
 */
export class SimHash {
  /** FNV-1a 64-bit prime multiplier. */
  private static readonly FNV_PRIME = 1099511628211n;
  /** FNV-1a 64-bit offset basis (the hash of the empty input). */
  private static readonly FNV_OFFSET_BASIS = 14695981039346656037n;
  /** Mask used to truncate intermediate products to 64 bits. */
  private static readonly MAX_UINT64 = 0xffffffffffffffffn;

  /**
   * Generates a 64-bit FNV-1a hash for a given string token.
   *
   * The token is consumed one UTF-16 code unit at a time (`charCodeAt`), so
   * for non-ASCII input the result differs from a byte-oriented FNV-1a.
   *
   * @param token - Text to hash; the empty string yields the offset basis.
   * @returns The hash as an unsigned 64-bit value.
   */
  static fnv1a64(token: string): bigint {
    let hash = this.FNV_OFFSET_BASIS;
    const len = token.length;
    for (let i = 0; i < len; i++) {
      hash ^= BigInt(token.charCodeAt(i));
      // BigInt never overflows, so wrap to 64 bits explicitly after multiplying.
      hash = (hash * this.FNV_PRIME) & this.MAX_UINT64;
    }
    return hash;
  }

  /**
   * Generates a 64-bit SimHash from an array of tokens.
   *
   * Every token votes +1 or -1 on each of the 64 bit positions depending on
   * whether that bit is set in the token's FNV-1a hash; a result bit is set
   * only when its tally is strictly positive.
   *
   * @param tokens - Tokens to fingerprint. Duplicates vote once per occurrence.
   * @returns The fingerprint; `0n` for an empty token list.
   */
  static generate(tokens: string[]): bigint {
    const votes = new Int32Array(64);

    for (const token of tokens) {
      const hash = this.fnv1a64(token);
      for (let i = 0n; i < 64n; i++) {
        if (((hash >> i) & 1n) === 1n) {
          votes[Number(i)]++;
        } else {
          votes[Number(i)]--;
        }
      }
    }

    let simhash = 0n;
    for (let i = 0n; i < 64n; i++) {
      if (votes[Number(i)] > 0) {
        simhash |= 1n << i;
      }
    }

    return simhash;
  }

  /**
   * Computes the Hamming distance between two 64-bit hashes.
   *
   * Both operands are compared on their low 64 bits, so a hash held in its
   * signed (two's-complement) form compares equal to the same bit pattern in
   * unsigned form. Without that reduction a negative XOR would skip the
   * counting loop entirely and report a distance of 0.
   *
   * @param a - First hash.
   * @param b - Second hash.
   * @returns Number of differing bits, in the range 0..64.
   */
  static hammingDistance(a: bigint, b: bigint): number {
    let xor = BigInt.asUintN(64, a ^ b);
    let distance = 0;
    while (xor > 0n) {
      // Kernighan's bit counting: each step clears the lowest set bit.
      xor &= xor - 1n;
      distance++;
    }
    return distance;
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
export * from './crawler/crawl.js';
|
|
2
|
+
export * from './crawler/metricsRunner.js';
|
|
3
|
+
export * from './graph/metrics.js';
|
|
4
|
+
export * from './report/html.js';
|
|
5
|
+
export * from './report/sitegraph_template.js';
|
|
6
|
+
export * from './report/sitegraphExport.js';
|
|
7
|
+
export * from './graph/graph.js';
|
|
8
|
+
export * from './diff/compare.js';
|
|
9
|
+
export * from './scoring/orphanSeverity.js';
|
|
10
|
+
export * from './graph/pagerank.js';
|
|
11
|
+
export * from './graph/duplicate.js';
|
|
12
|
+
export * from './graph/cluster.js';
|
|
13
|
+
export * from './scoring/hits.js';
|
|
14
|
+
export * from './analysis/analyze.js';
|
|
15
|
+
export * from './analysis/content.js';
|
|
16
|
+
export * from './analysis/seo.js';
|
|
17
|
+
export * from './analysis/images.js';
|
|
18
|
+
export * from './analysis/links.js';
|
|
19
|
+
export * from './audit/index.js';
|
|
20
|
+
export * from './audit/types.js';
|
|
21
|
+
export * from './db/index.js';
|
|
22
|
+
export * from './db/graphLoader.js';
|
|
23
|
+
export * from './db/repositories/SiteRepository.js';
|
|
24
|
+
export * from './db/repositories/SnapshotRepository.js';
|
|
25
|
+
export * from './db/repositories/PageRepository.js';
|
|
26
|
+
export * from './db/repositories/EdgeRepository.js';
|
|
27
|
+
export * from './db/repositories/MetricsRepository.js';
|
|
28
|
+
export * from './lock/lockManager.js';
|
|
29
|
+
export * from './lock/hashKey.js';
|
|
30
|
+
export * from './utils/version.js';
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import crypto from 'node:crypto';
|
|
2
|
+
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
+
|
|
4
|
+
// Option names that take part in the lock key. The order of this list is the
// order in which the options are serialized, so it must stay stable.
const RELEVANT_FLAGS = [
  'limit', 'depth', 'output', 'sitemap',
  'incremental', 'detectSoft404', 'detectTraps', 'includeSubdomains',
  'allow', 'deny', 'proxy', 'ua',
  'maxRedirects', 'rate', 'maxBytes', 'concurrency',
];
|
|
23
|
+
|
|
24
|
+
/**
 * Derives the lock identifier for one command invocation.
 *
 * The key is the SHA-256 hex digest of a JSON document holding the command
 * name, the normalized target URL and every option listed in RELEVANT_FLAGS
 * whose value is not `undefined`.
 *
 * @param commandName - Name of the command being run.
 * @param targetUrl - URL the command operates on.
 * @param options - Parsed command options; only RELEVANT_FLAGS and `query` are read.
 * @returns Hex-encoded SHA-256 digest.
 */
export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
  // The query string is stripped unless the `query` option is truthy; when
  // normalization yields a falsy result the raw target is used instead.
  const normalizedTarget =
    normalizeUrl(targetUrl, '', { stripQuery: !options.query }) || targetUrl;

  // Entries are emitted in RELEVANT_FLAGS order, which keeps the serialized
  // form independent of the order in which the caller populated `options`.
  const lockOptions: Record<string, any> = Object.fromEntries(
    RELEVANT_FLAGS
      .filter((flag) => options[flag] !== undefined)
      .map((flag) => [flag, options[flag]]),
  );

  const stableString = JSON.stringify({
    command: commandName,
    target: normalizedTarget,
    options: lockOptions,
  });

  const digest = crypto.createHash('sha256');
  digest.update(stableString);
  return digest.digest('hex');
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import { existsSync, unlinkSync, readFileSync } from 'node:fs';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import chalk from 'chalk';
|
|
6
|
+
import { generateLockKey } from './hashKey.js';
|
|
7
|
+
import { isPidAlive } from './pidCheck.js';
|
|
8
|
+
|
|
9
|
+
/** JSON payload written into a lock file. */
interface LockData {
  /** PID of the process that created the lock. */
  pid: number;
  /** Creation time as returned by `Date.now()`. */
  startedAt: number;
  /** Name of the command holding the lock. */
  command: string;
  /** Target URL exactly as passed by the caller. */
  target: string;
  /** The options object the command was invoked with. */
  args: any;
}
|
|
16
|
+
|
|
17
|
+
/**
 * File-based lock that prevents two identical commands from running at once.
 *
 * One lock file per (command, target, options) combination is kept under
 * `~/.crawlith/locks`. The file stores the owning PID so that locks left
 * behind by dead processes can be detected and removed.
 */
export class LockManager {
  /** Path of the lock file owned by this process, or null when none is held. */
  private static lockFilePath: string | null = null;

  /** Directory that holds all lock files. */
  private static get lockDir(): string {
    return path.join(os.homedir(), '.crawlith', 'locks');
  }

  // The handlers are stable references so that registerHandlers() can detach
  // them before re-attaching; anonymous closures would pile up on every call.

  /** 'exit' handler: synchronous lock removal. */
  private static readonly handleExit = (): void => {
    LockManager.releaseLock();
  };

  /** SIGINT handler: release the lock, then exit with 130. */
  private static readonly handleSigint = (): void => {
    LockManager.releaseLock();
    process.exit(130);
  };

  /** SIGTERM handler: release the lock, then exit with 143. */
  private static readonly handleSigterm = (): void => {
    LockManager.releaseLock();
    process.exit(143);
  };

  /** uncaughtException handler: log, release the lock, exit with 1. */
  private static readonly handleUncaughtException = (err: Error): void => {
    console.error(chalk.red('Uncaught Exception:'), err);
    LockManager.releaseLock();
    process.exit(1);
  };

  /**
   * Acquires the lock for a command or terminates the process.
   *
   * - A lock owned by a live process makes this call print an error and
   *   `process.exit(1)`, unless `force` is set.
   * - A lock that is unreadable, corrupt, or owned by a dead PID is treated
   *   as stale and removed.
   * - Losing the creation race to another process (EEXIST) also exits with 1.
   *
   * @param commandName - Name of the command being run.
   * @param targetUrl - URL the command operates on.
   * @param options - Command options; stored in the lock file and hashed into the key.
   * @param force - Remove any existing lock without checking its owner.
   * @throws Any file-system error other than EEXIST raised while writing the lock.
   */
  static async acquireLock(commandName: string, targetUrl: string, options: any, force: boolean = false): Promise<void> {
    const lockHash = generateLockKey(commandName, targetUrl, options);

    // Ensure lock directory exists (one-time setup, so async is fine here).
    await fs.mkdir(this.lockDir, { recursive: true });

    const lockPath = path.join(this.lockDir, `${lockHash}.lock`);

    // Check existing lock
    if (existsSync(lockPath)) {
      let isStale: boolean;
      let pid: number;

      try {
        const lockContent = readFileSync(lockPath, 'utf-8');
        const lockData = JSON.parse(lockContent);
        pid = lockData.pid;
        isStale = !isPidAlive(pid);
      } catch (_e) {
        // Corrupted -> Treat as stale
        isStale = true;
        pid = 0; // Fallback, though unused if isStale is true
      }

      if (force) {
        console.warn(chalk.yellow('Force mode enabled. Overriding existing lock.'));
        try { unlinkSync(lockPath); } catch { /* ignore */ }
      } else {
        if (!isStale) {
          console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (PID ${pid})`));
          process.exit(1);
        } else {
          console.log(chalk.gray('Detected stale lock. Continuing execution.'));
          try { unlinkSync(lockPath); } catch { /* ignore */ }
        }
      }
    }

    // Create new lock
    try {
      const data: LockData = {
        pid: process.pid,
        startedAt: Date.now(),
        command: commandName,
        target: targetUrl,
        args: options
      };

      // 'wx' flag ensures atomic creation, failing if file exists
      await fs.writeFile(lockPath, JSON.stringify(data, null, 2), { flag: 'wx', encoding: 'utf-8' });

      this.lockFilePath = lockPath;
      this.registerHandlers();
    } catch (error: any) {
      if (error.code === 'EEXIST') {
        // Race condition: another process created lock between our check and open
        console.error(chalk.red(`Crawlith: command already running for ${targetUrl} (Race condition)`));
        process.exit(1);
      }
      throw error;
    }
  }

  /**
   * Removes the lock file owned by this process, if any.
   *
   * Synchronous so it can run inside an 'exit' handler. Errors are ignored
   * on purpose: cleanup is best-effort and must not mask the original exit
   * reason. After a successful removal further calls are no-ops.
   */
  static releaseLock(): void {
    if (this.lockFilePath && existsSync(this.lockFilePath)) {
      try {
        unlinkSync(this.lockFilePath);
        this.lockFilePath = null;
      } catch (_error) {
        // Ignore errors during cleanup
      }
    }
  }

  /**
   * Installs the process-level cleanup hooks.
   *
   * Each handler is detached before it is attached, so calling
   * acquireLock() more than once in a process never leaves more than one
   * copy of each listener installed.
   */
  private static registerHandlers() {
    // process.on('exit') is only called when process.exit() is called or event loop empties.
    // It requires synchronous cleanup.
    process.off('exit', LockManager.handleExit);
    process.on('exit', LockManager.handleExit);

    // Signals
    process.off('SIGINT', LockManager.handleSigint);
    process.on('SIGINT', LockManager.handleSigint);

    process.off('SIGTERM', LockManager.handleSigterm);
    process.on('SIGTERM', LockManager.handleSigterm);

    process.off('uncaughtException', LockManager.handleUncaughtException);
    process.on('uncaughtException', LockManager.handleUncaughtException);
  }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Reports whether a process with the given PID currently exists.
 *
 * Sends signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param pid - Process id to probe.
 * @returns `true` when the process exists (including when it exists but may
 *   not be signalled, i.e. EPERM); `false` when it does not exist, when the
 *   probe fails for any other reason, or when `pid` is not a positive integer.
 */
export function isPidAlive(pid: number): boolean {
  // Zero and negative ids address process groups rather than one process, so
  // the probe would succeed for them and a lock file carrying such a value
  // would look permanently alive. NaN and fractions are not valid ids at all.
  if (!Number.isInteger(pid) || pid <= 0) {
    return false;
  }

  try {
    process.kill(pid, 0);
    return true;
  } catch (error: unknown) {
    // EPERM: process exists but we lack permission to signal it -> alive.
    // Anything else (ESRCH, invalid argument, ...) -> not alive.
    return (
      typeof error === 'object' &&
      error !== null &&
      (error as { code?: unknown }).code === 'EPERM'
    );
  }
}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { Metrics } from '../graph/metrics.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Serializes a value to JSON that is safe to embed inside an inline
 * `<script>` element.
 *
 * - `<` is emitted as `\u003c` so the payload can never close the script
 *   element or open an HTML comment.
 * - U+2028 / U+2029 are emitted as escapes because older JavaScript engines
 *   treat the raw characters as line terminators inside string literals.
 * - Values that JSON cannot represent at the top level (e.g. `undefined`,
 *   for which `JSON.stringify` returns `undefined`) are emitted as `null`
 *   instead of throwing.
 *
 * The result still parses back to the same value with `JSON.parse`.
 *
 * @param data - Value to serialize.
 * @returns Script-safe JSON text.
 */
function safeJson(data: unknown): string {
  const json: string | undefined = JSON.stringify(data);
  return (json ?? 'null')
    .replace(/</g, '\\u003c')
    .replace(/\u2028/g, '\\u2028')
    .replace(/\u2029/g, '\\u2029');
}
|
|
6
|
+
|
|
7
|
+
/**
 * Escapes a value for interpolation into HTML text or a quoted attribute.
 * The five significant characters are replaced by numeric character references.
 */
function escapeHtml(value: unknown): string {
  return String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
}

/**
 * Returns the path component of a URL for display, falling back to the raw
 * string when it cannot be parsed so that one bad entry does not abort the
 * whole report.
 */
function displayPath(url: string): string {
  try {
    return new URL(url).pathname;
  } catch {
    return url;
  }
}

/**
 * Renders the self-contained site-graph report page.
 *
 * The page shows a summary panel built from `metrics` and a D3 force-directed
 * graph built from `graphData`, which is embedded as `window.GRAPH_DATA`.
 * D3 itself is loaded from https://d3js.org at view time.
 *
 * URLs are escaped before being placed into markup, both in the server-side
 * lists and in the client-side tooltip.
 *
 * @param graphData - Graph payload; the client script reads `nodes` (with
 *   `url`, `depth`, `inLinks`, `outLinks`, `status`) and `edges`.
 * @param metrics - Aggregate metrics shown in the summary panel.
 * @returns The complete HTML document.
 */
export function generateHtml(graphData: any, metrics: Metrics): string {
  const graphJson = safeJson(graphData);

  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Crawlith Site Graph</title>
  <style>
    body { margin: 0; overflow: hidden; font-family: sans-serif; }
    #graph { width: 100vw; height: 100vh; background: #f0f0f0; }
    .tooltip {
      position: absolute;
      background: white;
      border: 1px solid #ccc;
      padding: 10px;
      pointer-events: none;
      font-size: 12px;
      box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
      display: none;
    }
    #metrics {
      position: absolute;
      top: 10px;
      left: 10px;
      background: rgba(255, 255, 255, 0.9);
      padding: 15px;
      border-radius: 5px;
      box-shadow: 0 0 10px rgba(0,0,0,0.1);
      max-width: 320px;
      max-height: 90vh;
      overflow-y: auto;
      z-index: 100;
    }
    h1 { font-size: 18px; margin-top: 0; }
    h2 { font-size: 14px; margin: 15px 0 5px; border-bottom: 1px solid #ddd; }
    ul { padding-left: 20px; margin: 5px 0; }
    .legend { margin-top: 10px; font-size: 11px; }
    .legend-item { display: flex; align-items: center; margin-bottom: 3px; }
    .dot { width: 8px; height: 8px; border-radius: 50%; margin-right: 5px; }
    .stat-row { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
    .stat-label { color: #666; }
    .stat-value { font-weight: bold; }
  </style>
</head>
<body>
  <div id="metrics">
    <h1>Crawlith Site Graph</h1>

    <div class="stat-row">
      <span class="stat-label">Discovered Pages:</span>
      <span class="stat-value">${metrics.totalPages}</span>
    </div>
    ${metrics.sessionStats ? `
    <div class="stat-row">
      <span class="stat-label">Session Crawl:</span>
      <span class="stat-value">${metrics.sessionStats.pagesFetched} pages</span>
    </div>
    ${metrics.sessionStats.pagesCached > 0 ? `
    <div class="stat-row" style="font-size: 11px; margin-top: -3px;">
      <span class="stat-label" style="padding-left: 10px;">- Reuse Cached:</span>
      <span class="stat-value">${metrics.sessionStats.pagesCached}</span>
    </div>` : ''}
    ` : ''}
    <div class="stat-row">
      <span class="stat-label">Total Edges:</span>
      <span class="stat-value">${metrics.totalEdges}</span>
    </div>
    <div class="stat-row">
      <span class="stat-label">Max Depth:</span>
      <span class="stat-value">${metrics.maxDepthFound}</span>
    </div>
    <div class="stat-row">
      <span class="stat-label">Avg Out-Degree:</span>
      <span class="stat-value">${metrics.averageOutDegree.toFixed(2)}</span>
    </div>

    <div class="legend">
      <div class="legend-item"><div class="dot" style="background: red;"></div>Orphan (In-Links: 0)</div>
      <div class="legend-item"><div class="dot" style="background: orange;"></div>Deep (Depth >= 4)</div>
      <div class="legend-item"><div class="dot" style="background: blue;"></div>Normal</div>
    </div>

    ${metrics.topAuthorityPages.length > 0 ? `
    <h3>Top Authority</h3>
    <ul>
      ${metrics.topAuthorityPages.map(p => `<li><a href="${escapeHtml(p.url)}" target="_blank">${escapeHtml(displayPath(p.url))}</a> (${p.authority.toFixed(2)})</li>`).join('')}
    </ul>
    ` : ''}

    ${metrics.orphanPages.length > 0 ? `
    <h3>Orphan Pages (${metrics.orphanPages.length})</h3>
    <details>
      <summary>Show list</summary>
      <ul>
        ${metrics.orphanPages.slice(0, 20).map(url => `<li><a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a></li>`).join('')}
        ${metrics.orphanPages.length > 20 ? `<li>... and ${metrics.orphanPages.length - 20} more</li>` : ''}
      </ul>
    </details>
    ` : ''}
  </div>
  <div id="graph"></div>
  <div class="tooltip" id="tooltip"></div>

  <script src="https://d3js.org/d3.v7.min.js"></script>
  <script>
    // Make data available globally
    window.GRAPH_DATA = ${graphJson};

    const data = window.GRAPH_DATA;
    const width = window.innerWidth;
    const height = window.innerHeight;

    // Escape text before it is handed to .html() in the tooltip
    function escapeHtml(value) {
      return String(value).replace(/[&<>"']/g, c => "&#" + c.charCodeAt(0) + ";");
    }

    const svg = d3.select("#graph").append("svg")
      .attr("width", width)
      .attr("height", height)
      .call(d3.zoom().on("zoom", (event) => {
        g.attr("transform", event.transform);
      }));

    const g = svg.append("g");

    // Define arrow marker
    svg.append("defs").selectAll("marker")
      .data(["arrow"])
      .enter().append("marker")
      .attr("id", d => d)
      .attr("viewBox", "0 -5 10 10")
      .attr("refX", 15)
      .attr("refY", 0)
      .attr("markerWidth", 6)
      .attr("markerHeight", 6)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-5L10,0L0,5")
      .attr("fill", "#999");

    const simulation = d3.forceSimulation(data.nodes)
      .force("link", d3.forceLink(data.edges).id(d => d.url).distance(100))
      .force("charge", d3.forceManyBody().strength(-300))
      .force("center", d3.forceCenter(width / 2, height / 2))
      .force("collide", d3.forceCollide().radius(d => Math.sqrt((d.inLinks || 0) + 1) * 5 + 2));

    const link = g.append("g")
      .attr("stroke", "#999")
      .attr("stroke-opacity", 0.6)
      .selectAll("line")
      .data(data.edges)
      .join("line")
      .attr("stroke-width", 1)
      .attr("marker-end", "url(#arrow)");


    const node = g.append("g")
      .attr("stroke", "#fff")
      .attr("stroke-width", 1.5)
      .selectAll("circle")
      .data(data.nodes)
      .join("circle")
      .attr("r", d => Math.sqrt((d.inLinks || 0) + 1) * 3 + 2)
      .attr("fill", d => {
        if (d.inLinks === 0 && d.depth > 0) return "red";
        if (d.depth >= 4) return "orange";
        return "blue";
      })
      .call(d3.drag()
        .on("start", dragstarted)
        .on("drag", dragged)
        .on("end", dragended));

    const tooltip = d3.select("#tooltip");

    node.on("mouseover", (event, d) => {
      tooltip.style("display", "block")
        .html(\`
          <strong>URL:</strong> \${escapeHtml(d.url)}<br>
          <strong>Depth:</strong> \${escapeHtml(d.depth)}<br>
          <strong>In-Links:</strong> \${escapeHtml(d.inLinks)}<br>
          <strong>Out-Links:</strong> \${escapeHtml(d.outLinks)}<br>
          <strong>Status:</strong> \${escapeHtml(d.status)}
        \`)
        .style("left", (event.pageX + 10) + "px")
        .style("top", (event.pageY - 10) + "px");
    })
    .on("mouseout", () => {
      tooltip.style("display", "none");
    });

    simulation.on("tick", () => {
      link
        .attr("x1", d => d.source.x)
        .attr("y1", d => d.source.y)
        .attr("x2", d => d.target.x)
        .attr("y2", d => d.target.y);

      node
        .attr("cx", d => d.x)
        .attr("cy", d => d.y);
    });

    function dragstarted(event, d) {
      if (!event.active) simulation.alphaTarget(0.3).restart();
      d.fx = d.x;
      d.fy = d.y;
    }

    function dragged(event, d) {
      d.fx = event.x;
      d.fy = event.y;
    }

    function dragended(event, d) {
      if (!event.active) simulation.alphaTarget(0);
      d.fx = null;
      d.fy = null;
    }
  </script>
</body>
</html>`;
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
 * Renders the node table of a site graph as CSV.
 *
 * Columns: URL, Depth, Status, InboundLinks, OutboundLinks, PageRankScore.
 * Link counts are derived from `graphData.edges`; a status of `0` is shown
 * as `Pending/Limit`; a missing PageRank score is shown as `0.000`.
 *
 * Fields containing a comma, double quote or line break are quoted with
 * embedded quotes doubled, so URLs with such characters stay in one column.
 * Fields without those characters are emitted unchanged.
 *
 * @param graphData - Object with `nodes` (`url`, `depth`, `status`,
 *   `pageRankScore`) and `edges` (`source`, `target`).
 * @returns CSV text: a header line followed by one line per node, joined by `\n`.
 */
export function renderSitegraphCsvNodes(graphData: any): string {
  const csvCell = (value: unknown): string => {
    // Mirror Array.prototype.join: null and undefined become empty fields.
    const text = value == null ? '' : String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  // Tally both directions in one pass instead of rescanning all edges per node.
  const outboundByUrl = new Map<unknown, number>();
  const inboundByUrl = new Map<unknown, number>();
  for (const edge of graphData.edges ?? []) {
    outboundByUrl.set(edge.source, (outboundByUrl.get(edge.source) ?? 0) + 1);
    inboundByUrl.set(edge.target, (inboundByUrl.get(edge.target) ?? 0) + 1);
  }

  const nodeHeaders = ['URL', 'Depth', 'Status', 'InboundLinks', 'OutboundLinks', 'PageRankScore'];
  const nodeRows = graphData.nodes.map((n: any) => {
    const statusStr = n.status === 0 ? 'Pending/Limit' : n.status;
    return [
      n.url,
      n.depth,
      statusStr,
      inboundByUrl.get(n.url) ?? 0,
      outboundByUrl.get(n.url) ?? 0,
      (n.pageRankScore || 0).toFixed(3),
    ].map(csvCell).join(',');
  });

  return [nodeHeaders.join(','), ...nodeRows].join('\n');
}
|
|
11
|
+
|
|
12
|
+
/**
 * Renders the edge list of a site graph as CSV.
 *
 * Columns: Source, Target, Weight. A missing weight yields an empty field.
 * Fields containing a comma, double quote or line break are quoted with
 * embedded quotes doubled; all other fields are emitted unchanged.
 *
 * @param graphData - Object whose `edges` carry `source`, `target`, `weight`.
 * @returns CSV text: a header line followed by one line per edge, joined by `\n`.
 */
export function renderSitegraphCsvEdges(graphData: any): string {
  const csvCell = (value: unknown): string => {
    // Mirror Array.prototype.join: null and undefined become empty fields.
    const text = value == null ? '' : String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const edgeHeaders = ['Source', 'Target', 'Weight'];
  const edgeRows = graphData.edges.map((e: any) =>
    [e.source, e.target, e.weight].map(csvCell).join(','),
  );

  return [edgeHeaders.join(','), ...edgeRows].join('\n');
}
|
|
17
|
+
|
|
18
|
+
/**
 * Renders a Markdown summary of a crawl.
 *
 * Sections: headline metrics, the ten nodes with the most inbound edges, and
 * (when `metrics.topPageRankPages` is non-empty) up to ten PageRank entries.
 *
 * URLs placed in table cells have `|` escaped as `\|` so that they cannot
 * split the row into extra columns.
 *
 * @param url - Crawl target shown in the title.
 * @param graphData - Object with `nodes` (`url`, `status`) and `edges` (`target`).
 * @param metrics - Aggregate metrics (`totalPages`, `totalEdges`,
 *   `averageDepth`, `maxDepthFound`, `crawlEfficiencyScore`, optional
 *   `topPageRankPages`).
 * @param graph - Live graph; `sessionStats.pagesFetched` is read if present,
 *   and PageRank scores are looked up through `nodes.get(url)` or, failing
 *   that, `getNodes()`.
 * @returns The Markdown document.
 */
export function renderSitegraphMarkdown(url: string, graphData: any, metrics: any, graph: any): string {
  const tableCell = (value: unknown): string => String(value).replace(/\|/g, '\\|');

  const md = [
    `# Crawlith Crawl Summary - ${url}`,
    '',
    `## 📊 Metrics`,
    `- Total Pages Discovered: ${metrics.totalPages}`,
    `- Session Pages Crawled: ${graph.sessionStats?.pagesFetched ?? 0}`,
    `- Total Edges: ${metrics.totalEdges}`,
    `- Avg Depth: ${metrics.averageDepth.toFixed(2)}`,
    `- Max Depth: ${metrics.maxDepthFound}`,
    `- Crawl Efficiency: ${(metrics.crawlEfficiencyScore * 100).toFixed(1)}%`,
    '',
    `## 📄 Top Pages (by In-degree)`,
  ];

  // Count inbound edges once instead of rescanning every edge for every node.
  const inboundByUrl = new Map<unknown, number>();
  for (const edge of graphData.edges ?? []) {
    inboundByUrl.set(edge.target, (inboundByUrl.get(edge.target) ?? 0) + 1);
  }

  const topPages = [...graphData.nodes]
    .map((n: any) => ({ ...n, inLinks: inboundByUrl.get(n.url) ?? 0 }))
    .sort((a, b) => b.inLinks - a.inLinks)
    .slice(0, 10);

  md.push('| URL | Inbound | Status |');
  md.push('| :--- | :--- | :--- |');
  for (const p of topPages) {
    const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
    md.push(`| ${tableCell(p.url)} | ${p.inLinks} | ${statusStr} |`);
  }

  if (metrics.topPageRankPages?.length > 0) {
    md.push('');
    md.push('## 🏆 Top PageRank Pages');
    md.push('| URL | Score |');
    md.push('| :--- | :--- |');
    for (const p of metrics.topPageRankPages.slice(0, 10)) {
      // Prefer a Map-style lookup; fall back to scanning getNodes().
      const node = graph.nodes?.get
        ? graph.nodes.get(p.url)
        : graph.getNodes?.().find((x: any) => x.url === p.url);
      const score = node?.pageRankScore ?? 0;
      md.push(`| ${tableCell(p.url)} | ${score.toFixed(3)}/100 |`);
    }
  }

  return md.join('\n');
}
|