@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -2,6 +2,8 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface Site {
|
|
3
3
|
id: number;
|
|
4
4
|
domain: string;
|
|
5
|
+
preferred_url: string | null;
|
|
6
|
+
ssl: number | null;
|
|
5
7
|
created_at: string;
|
|
6
8
|
settings_json: string | null;
|
|
7
9
|
is_active: number;
|
|
@@ -13,6 +15,10 @@ export declare class SiteRepository {
|
|
|
13
15
|
getSite(domain: string): Site | undefined;
|
|
14
16
|
getAllSites(): Site[];
|
|
15
17
|
createSite(domain: string): number;
|
|
18
|
+
updateSitePreference(id: number, prefs: {
|
|
19
|
+
preferred_url: string;
|
|
20
|
+
ssl: number;
|
|
21
|
+
}): void;
|
|
16
22
|
firstOrCreateSite(domain: string): Site;
|
|
17
23
|
deleteSite(id: number): void;
|
|
18
24
|
}
|
|
@@ -17,6 +17,10 @@ export class SiteRepository {
|
|
|
17
17
|
const info = stmt.run(domain);
|
|
18
18
|
return info.lastInsertRowid;
|
|
19
19
|
}
|
|
20
|
+
updateSitePreference(id, prefs) {
|
|
21
|
+
const stmt = this.db.prepare('UPDATE sites SET preferred_url = ?, ssl = ? WHERE id = ?');
|
|
22
|
+
stmt.run(prefs.preferred_url, prefs.ssl, id);
|
|
23
|
+
}
|
|
20
24
|
firstOrCreateSite(domain) {
|
|
21
25
|
let site = this.getSite(domain);
|
|
22
26
|
if (!site) {
|
|
@@ -2,11 +2,11 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface Snapshot {
|
|
3
3
|
id: number;
|
|
4
4
|
site_id: number;
|
|
5
|
-
|
|
5
|
+
run_type: 'completed' | 'incremental' | 'single';
|
|
6
6
|
created_at: string;
|
|
7
7
|
node_count: number;
|
|
8
8
|
edge_count: number;
|
|
9
|
-
status: 'running' | 'completed' | 'failed';
|
|
9
|
+
status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
|
|
10
10
|
limit_reached: number;
|
|
11
11
|
health_score: number | null;
|
|
12
12
|
orphan_count: number | null;
|
|
@@ -15,10 +15,17 @@ export interface Snapshot {
|
|
|
15
15
|
export declare class SnapshotRepository {
|
|
16
16
|
private db;
|
|
17
17
|
constructor(db: Database);
|
|
18
|
-
createSnapshot(siteId: number,
|
|
19
|
-
getLatestSnapshot(siteId: number, status?: '
|
|
18
|
+
createSnapshot(siteId: number, runType: Snapshot['run_type'], status?: Snapshot['status']): number;
|
|
19
|
+
getLatestSnapshot(siteId: number, status?: Snapshot['status'], includeSingle?: boolean): Snapshot | undefined;
|
|
20
|
+
touchSnapshot(id: number): void;
|
|
20
21
|
getSnapshotCount(siteId: number): number;
|
|
21
|
-
|
|
22
|
+
/**
|
|
23
|
+
* Returns true if the site has ever had a completed full or incremental crawl.
|
|
24
|
+
* Single snapshots (from page --live) do NOT count as a "first crawl".
|
|
25
|
+
*/
|
|
26
|
+
hasFullCrawl(siteId: number): boolean;
|
|
27
|
+
updateSnapshotStatus(id: number, status: Snapshot['status'], stats?: Partial<Snapshot>): void;
|
|
22
28
|
getSnapshot(id: number): Snapshot | undefined;
|
|
23
29
|
deleteSnapshot(id: number): void;
|
|
30
|
+
pruneSnapshots(siteId: number, maxSnapshots: number, maxSingleSnapshots: number, protectedSnapshotId?: number): void;
|
|
24
31
|
}
|
|
@@ -3,18 +3,16 @@ export class SnapshotRepository {
|
|
|
3
3
|
constructor(db) {
|
|
4
4
|
this.db = db;
|
|
5
5
|
}
|
|
6
|
-
createSnapshot(siteId,
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
// If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
|
|
10
|
-
// We should rely on ID for stability if timestamps are equal, but the query uses created_at.
|
|
11
|
-
// Let's ensure we can also order by ID as tie-breaker.
|
|
12
|
-
const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
|
|
13
|
-
const info = stmt.run(siteId, type, status);
|
|
6
|
+
createSnapshot(siteId, runType, status = 'running') {
|
|
7
|
+
const stmt = this.db.prepare('INSERT INTO snapshots (site_id, run_type, status) VALUES (?, ?, ?)');
|
|
8
|
+
const info = stmt.run(siteId, runType, status);
|
|
14
9
|
return info.lastInsertRowid;
|
|
15
10
|
}
|
|
16
|
-
getLatestSnapshot(siteId, status) {
|
|
17
|
-
let sql = 'SELECT * FROM snapshots WHERE site_id = ?
|
|
11
|
+
getLatestSnapshot(siteId, status, includeSingle = false) {
|
|
12
|
+
let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
|
|
13
|
+
if (!includeSingle) {
|
|
14
|
+
sql += ' AND run_type != \'single\'';
|
|
15
|
+
}
|
|
18
16
|
const params = [siteId];
|
|
19
17
|
if (status) {
|
|
20
18
|
sql += ' AND status = ?';
|
|
@@ -23,10 +21,21 @@ export class SnapshotRepository {
|
|
|
23
21
|
sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
|
|
24
22
|
return this.db.prepare(sql).get(...params);
|
|
25
23
|
}
|
|
24
|
+
touchSnapshot(id) {
|
|
25
|
+
this.db.prepare(`UPDATE snapshots SET created_at = datetime('now') WHERE id = ?`).run(id);
|
|
26
|
+
}
|
|
26
27
|
getSnapshotCount(siteId) {
|
|
27
28
|
const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
|
|
28
29
|
return result.count;
|
|
29
30
|
}
|
|
31
|
+
/**
|
|
32
|
+
* Returns true if the site has ever had a completed full or incremental crawl.
|
|
33
|
+
* Single snapshots (from page --live) do NOT count as a "first crawl".
|
|
34
|
+
*/
|
|
35
|
+
hasFullCrawl(siteId) {
|
|
36
|
+
const result = this.db.prepare(`SELECT COUNT(*) as count FROM snapshots WHERE site_id = ? AND run_type IN ('completed', 'incremental') AND status = 'completed'`).get(siteId);
|
|
37
|
+
return result.count > 0;
|
|
38
|
+
}
|
|
30
39
|
updateSnapshotStatus(id, status, stats = {}) {
|
|
31
40
|
const sets = ['status = ?'];
|
|
32
41
|
const params = [status];
|
|
@@ -73,4 +82,33 @@ export class SnapshotRepository {
|
|
|
73
82
|
});
|
|
74
83
|
tx();
|
|
75
84
|
}
|
|
85
|
+
pruneSnapshots(siteId, maxSnapshots, maxSingleSnapshots, protectedSnapshotId) {
|
|
86
|
+
const tx = this.db.transaction(() => {
|
|
87
|
+
const singlesToDelete = this.db.prepare(`
|
|
88
|
+
SELECT id
|
|
89
|
+
FROM snapshots
|
|
90
|
+
WHERE site_id = ? AND run_type = 'single'
|
|
91
|
+
ORDER BY created_at DESC, id DESC
|
|
92
|
+
LIMIT -1 OFFSET ?
|
|
93
|
+
`).all(siteId, Math.max(0, maxSingleSnapshots));
|
|
94
|
+
const fullToDelete = this.db.prepare(`
|
|
95
|
+
SELECT id
|
|
96
|
+
FROM snapshots
|
|
97
|
+
WHERE site_id = ? AND run_type IN ('completed', 'incremental')
|
|
98
|
+
ORDER BY created_at DESC, id DESC
|
|
99
|
+
LIMIT -1 OFFSET ?
|
|
100
|
+
`).all(siteId, Math.max(0, maxSnapshots));
|
|
101
|
+
const ids = [...singlesToDelete, ...fullToDelete]
|
|
102
|
+
.map(r => r.id)
|
|
103
|
+
.filter(id => id !== protectedSnapshotId);
|
|
104
|
+
for (const id of ids) {
|
|
105
|
+
// Inline delete logic to keep operation inside this transaction.
|
|
106
|
+
this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
|
|
107
|
+
this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
|
|
108
|
+
this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
|
|
109
|
+
this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
|
|
110
|
+
}
|
|
111
|
+
});
|
|
112
|
+
tx();
|
|
113
|
+
}
|
|
76
114
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export interface ResetOptions {
|
|
2
|
+
reportsDir?: string;
|
|
3
|
+
dryRun?: boolean;
|
|
4
|
+
}
|
|
5
|
+
/**
|
|
6
|
+
* Completely resets the Crawlith state.
|
|
7
|
+
* Deletes the database, clears all locks, and optionally wipes the reports directory.
|
|
8
|
+
*/
|
|
9
|
+
export declare function resetCrawlith(options?: ResetOptions): Promise<void>;
|
package/dist/db/reset.js
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import { closeDb, getDb, getDbPath } from './index.js';
|
|
5
|
+
import { LockManager } from '../lock/lockManager.js';
|
|
6
|
+
/**
|
|
7
|
+
* Completely resets the Crawlith state.
|
|
8
|
+
* Deletes the database, clears all locks, and optionally wipes the reports directory.
|
|
9
|
+
*/
|
|
10
|
+
export async function resetCrawlith(options = {}) {
|
|
11
|
+
const { reportsDir, dryRun = false } = options;
|
|
12
|
+
if (dryRun) {
|
|
13
|
+
return;
|
|
14
|
+
}
|
|
15
|
+
// 1. Close database connection to release file handles
|
|
16
|
+
closeDb();
|
|
17
|
+
// 2. Clear all locks
|
|
18
|
+
await LockManager.clearAllLocks();
|
|
19
|
+
// 3. Remove the entire state directory (includes DB)
|
|
20
|
+
const dbPath = getDbPath();
|
|
21
|
+
if (dbPath !== ':memory:') {
|
|
22
|
+
const crawlithDir = path.join(os.homedir(), '.crawlith');
|
|
23
|
+
await fs.rm(crawlithDir, { recursive: true, force: true });
|
|
24
|
+
}
|
|
25
|
+
// 4. Remove reports directory if specified
|
|
26
|
+
if (reportsDir) {
|
|
27
|
+
const resolvedReportsDir = path.resolve(reportsDir);
|
|
28
|
+
await fs.rm(resolvedReportsDir, { recursive: true, force: true });
|
|
29
|
+
}
|
|
30
|
+
// 5. Re-initialize database to ensure schema is fresh for next use
|
|
31
|
+
getDb();
|
|
32
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { Database, Statement } from 'better-sqlite3';
|
|
2
|
+
export declare class Statements {
|
|
3
|
+
private db;
|
|
4
|
+
getPageIdByUrl: Statement;
|
|
5
|
+
insertPluginReport: Statement;
|
|
6
|
+
getPluginReport: Statement;
|
|
7
|
+
deleteSnapshotPlugins: Statement;
|
|
8
|
+
getSnapshot: Statement;
|
|
9
|
+
getMigration: Statement;
|
|
10
|
+
insertMigration: Statement;
|
|
11
|
+
constructor(db: Database);
|
|
12
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
export class Statements {
|
|
2
|
+
db;
|
|
3
|
+
getPageIdByUrl;
|
|
4
|
+
insertPluginReport;
|
|
5
|
+
getPluginReport;
|
|
6
|
+
deleteSnapshotPlugins;
|
|
7
|
+
getSnapshot;
|
|
8
|
+
getMigration;
|
|
9
|
+
insertMigration;
|
|
10
|
+
constructor(db) {
|
|
11
|
+
this.db = db;
|
|
12
|
+
this.getPageIdByUrl = this.db.prepare(`
|
|
13
|
+
SELECT id FROM pages
|
|
14
|
+
WHERE site_id = (SELECT site_id FROM snapshots WHERE id = ?)
|
|
15
|
+
AND normalized_url = ?
|
|
16
|
+
`);
|
|
17
|
+
this.insertPluginReport = this.db.prepare(`
|
|
18
|
+
INSERT OR REPLACE INTO plugin_reports
|
|
19
|
+
(snapshot_id, plugin_name, data, total_score, score_count, score_weight_sum, score_calculated_at)
|
|
20
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
21
|
+
`);
|
|
22
|
+
this.getPluginReport = this.db.prepare(`
|
|
23
|
+
SELECT data FROM plugin_reports
|
|
24
|
+
WHERE snapshot_id = ? AND plugin_name = ?
|
|
25
|
+
ORDER BY created_at DESC LIMIT 1
|
|
26
|
+
`);
|
|
27
|
+
this.deleteSnapshotPlugins = this.db.prepare(`
|
|
28
|
+
DELETE FROM plugin_reports WHERE snapshot_id = ?
|
|
29
|
+
`);
|
|
30
|
+
this.getSnapshot = this.db.prepare(`
|
|
31
|
+
SELECT id FROM snapshots WHERE id = ?
|
|
32
|
+
`);
|
|
33
|
+
this.getMigration = this.db.prepare(`
|
|
34
|
+
SELECT plugin_name FROM plugin_migrations WHERE plugin_name = ?
|
|
35
|
+
`);
|
|
36
|
+
this.insertMigration = this.db.prepare(`
|
|
37
|
+
INSERT INTO plugin_migrations (plugin_name) VALUES (?)
|
|
38
|
+
`);
|
|
39
|
+
}
|
|
40
|
+
}
|
package/dist/diff/compare.d.ts
CHANGED
|
@@ -12,11 +12,6 @@ export interface DiffResult {
|
|
|
12
12
|
oldCanonical: string | null;
|
|
13
13
|
newCanonical: string | null;
|
|
14
14
|
}[];
|
|
15
|
-
changedDuplicateGroup: {
|
|
16
|
-
url: string;
|
|
17
|
-
oldGroup: string | null;
|
|
18
|
-
newGroup: string | null;
|
|
19
|
-
}[];
|
|
20
15
|
metricDeltas: {
|
|
21
16
|
structuralEntropy: number;
|
|
22
17
|
orphanCount: number;
|
package/dist/diff/compare.js
CHANGED
|
@@ -6,7 +6,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
6
6
|
const removedUrls = [];
|
|
7
7
|
const changedStatus = [];
|
|
8
8
|
const changedCanonical = [];
|
|
9
|
-
const changedDuplicateGroup = [];
|
|
10
9
|
// Added & Changed
|
|
11
10
|
for (const [url, newNode] of newNodes) {
|
|
12
11
|
const oldNode = oldNodes.get(url);
|
|
@@ -26,16 +25,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
26
25
|
newCanonical: newNode.canonical || null
|
|
27
26
|
});
|
|
28
27
|
}
|
|
29
|
-
// Changed Duplicate Group
|
|
30
|
-
const oldGroup = oldNode.duplicateClusterId || null;
|
|
31
|
-
const newGroup = newNode.duplicateClusterId || null;
|
|
32
|
-
if (oldGroup !== newGroup) {
|
|
33
|
-
changedDuplicateGroup.push({
|
|
34
|
-
url,
|
|
35
|
-
oldGroup,
|
|
36
|
-
newGroup
|
|
37
|
-
});
|
|
38
|
-
}
|
|
39
28
|
}
|
|
40
29
|
}
|
|
41
30
|
// Removed
|
|
@@ -58,7 +47,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
58
47
|
removedUrls,
|
|
59
48
|
changedStatus,
|
|
60
49
|
changedCanonical,
|
|
61
|
-
changedDuplicateGroup,
|
|
62
50
|
metricDeltas
|
|
63
51
|
};
|
|
64
52
|
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
export interface DiffOptions {
|
|
3
|
+
onlyCritical?: boolean;
|
|
4
|
+
}
|
|
5
|
+
export interface SnapshotDiff {
|
|
6
|
+
newPages: string[];
|
|
7
|
+
removedPages: string[];
|
|
8
|
+
changedPages: {
|
|
9
|
+
url: string;
|
|
10
|
+
changes: string[];
|
|
11
|
+
severity: 'low' | 'medium' | 'high';
|
|
12
|
+
}[];
|
|
13
|
+
}
|
|
14
|
+
export declare class DiffService {
|
|
15
|
+
compare(oldGraph: Graph | undefined, newGraph: Graph, _options?: DiffOptions): SnapshotDiff;
|
|
16
|
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
export class DiffService {
|
|
2
|
+
compare(oldGraph, newGraph, _options = {}) {
|
|
3
|
+
if (!oldGraph) {
|
|
4
|
+
return {
|
|
5
|
+
newPages: Array.from(newGraph.nodes.keys()),
|
|
6
|
+
removedPages: [],
|
|
7
|
+
changedPages: []
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
const oldUrls = new Set(oldGraph.nodes.keys());
|
|
11
|
+
const newUrls = new Set(newGraph.nodes.keys());
|
|
12
|
+
const newPages = Array.from(newUrls).filter(u => !oldUrls.has(u));
|
|
13
|
+
const removedPages = Array.from(oldUrls).filter(u => !newUrls.has(u));
|
|
14
|
+
const changedPages = [];
|
|
15
|
+
for (const url of newUrls) {
|
|
16
|
+
if (oldUrls.has(url)) {
|
|
17
|
+
const oldNode = oldGraph.nodes.get(url);
|
|
18
|
+
const newNode = newGraph.nodes.get(url);
|
|
19
|
+
const changes = [];
|
|
20
|
+
let severity = 'low';
|
|
21
|
+
if (oldNode.status !== newNode.status) {
|
|
22
|
+
changes.push(`status: ${oldNode.status} -> ${newNode.status}`);
|
|
23
|
+
severity = 'high';
|
|
24
|
+
}
|
|
25
|
+
if (oldNode.contentHash !== newNode.contentHash) {
|
|
26
|
+
changes.push('content changed');
|
|
27
|
+
if (severity !== 'high')
|
|
28
|
+
severity = 'medium';
|
|
29
|
+
}
|
|
30
|
+
if (oldNode.noindex !== newNode.noindex) {
|
|
31
|
+
changes.push(`noindex: ${oldNode.noindex} -> ${newNode.noindex}`);
|
|
32
|
+
severity = 'high';
|
|
33
|
+
}
|
|
34
|
+
if (changes.length > 0) {
|
|
35
|
+
changedPages.push({ url, changes, severity });
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
return { newPages, removedPages, changedPages };
|
|
40
|
+
}
|
|
41
|
+
}
|
package/dist/events.d.ts
CHANGED
|
@@ -15,6 +15,14 @@ export type CrawlEvent = {
|
|
|
15
15
|
} | {
|
|
16
16
|
type: 'crawl:limit-reached';
|
|
17
17
|
limit: number;
|
|
18
|
+
} | {
|
|
19
|
+
type: 'crawl:progress';
|
|
20
|
+
pagesCrawled: number;
|
|
21
|
+
queued: number;
|
|
22
|
+
active: number;
|
|
23
|
+
nodesFound: number;
|
|
24
|
+
edgesFound: number;
|
|
25
|
+
phase?: string;
|
|
18
26
|
} | {
|
|
19
27
|
type: 'queue:enqueue';
|
|
20
28
|
url: string;
|
package/dist/graph/graph.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export interface GraphNode {
|
|
2
2
|
url: string;
|
|
3
|
+
isInternal?: boolean;
|
|
3
4
|
depth: number;
|
|
4
5
|
inLinks: number;
|
|
5
6
|
outLinks: number;
|
|
@@ -9,50 +10,48 @@ export interface GraphNode {
|
|
|
9
10
|
nofollow?: boolean;
|
|
10
11
|
brokenLinks?: string[];
|
|
11
12
|
redirectChain?: string[];
|
|
13
|
+
discoveredViaSitemap?: boolean;
|
|
12
14
|
incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
|
|
13
15
|
etag?: string;
|
|
14
16
|
lastModified?: string;
|
|
15
17
|
contentHash?: string;
|
|
16
18
|
html?: string;
|
|
17
|
-
pageRank?: number;
|
|
18
|
-
pageRankScore?: number;
|
|
19
|
-
authorityScore?: number;
|
|
20
|
-
hubScore?: number;
|
|
21
|
-
duplicateClusterId?: string;
|
|
22
|
-
duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
|
|
23
|
-
isClusterPrimary?: boolean;
|
|
24
|
-
isCollapsed?: boolean;
|
|
25
|
-
collapseInto?: string;
|
|
26
19
|
simhash?: string;
|
|
27
20
|
uniqueTokenRatio?: number;
|
|
28
|
-
soft404Score?: number;
|
|
29
|
-
soft404Signals?: string[];
|
|
30
21
|
crawlTrapFlag?: boolean;
|
|
31
22
|
crawlTrapRisk?: number;
|
|
32
23
|
trapType?: string;
|
|
33
24
|
securityError?: string;
|
|
34
25
|
retries?: number;
|
|
35
|
-
clusterId?: number;
|
|
36
26
|
bytesReceived?: number;
|
|
37
|
-
linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
|
|
38
27
|
crawlStatus?: string;
|
|
39
28
|
wordCount?: number;
|
|
40
29
|
thinContentScore?: number;
|
|
41
30
|
externalLinkRatio?: number;
|
|
31
|
+
h1Count?: number;
|
|
32
|
+
h2Count?: number;
|
|
33
|
+
title?: string;
|
|
34
|
+
clusterId?: number;
|
|
35
|
+
duplicateClusterId?: string;
|
|
36
|
+
duplicateType?: 'exact' | 'near' | 'template_heavy';
|
|
37
|
+
pagerankScore?: number;
|
|
38
|
+
hubScore?: number;
|
|
39
|
+
authScore?: number;
|
|
40
|
+
linkRole?: string;
|
|
41
|
+
soft404Score?: number;
|
|
42
|
+
headingScore?: number;
|
|
42
43
|
orphanScore?: number;
|
|
44
|
+
orphanType?: string;
|
|
45
|
+
impactLevel?: string;
|
|
46
|
+
headingData?: string;
|
|
47
|
+
isClusterPrimary?: boolean;
|
|
48
|
+
isCollapsed?: boolean;
|
|
43
49
|
}
|
|
44
50
|
export interface GraphEdge {
|
|
45
51
|
source: string;
|
|
46
52
|
target: string;
|
|
47
53
|
weight: number;
|
|
48
54
|
}
|
|
49
|
-
export interface ClusterInfo {
|
|
50
|
-
id: number;
|
|
51
|
-
count: number;
|
|
52
|
-
primaryUrl: string;
|
|
53
|
-
risk: 'low' | 'medium' | 'high';
|
|
54
|
-
sharedPathPrefix?: string;
|
|
55
|
-
}
|
|
56
55
|
export interface CrawlStats {
|
|
57
56
|
pagesFetched: number;
|
|
58
57
|
pagesCached: number;
|
|
@@ -64,19 +63,6 @@ export declare class Graph {
|
|
|
64
63
|
edges: Map<string, number>;
|
|
65
64
|
limitReached: boolean;
|
|
66
65
|
sessionStats: CrawlStats;
|
|
67
|
-
trapClusters: {
|
|
68
|
-
pattern: string;
|
|
69
|
-
type: string;
|
|
70
|
-
count: number;
|
|
71
|
-
}[];
|
|
72
|
-
duplicateClusters: {
|
|
73
|
-
id: string;
|
|
74
|
-
type: 'exact' | 'near' | 'template_heavy';
|
|
75
|
-
size: number;
|
|
76
|
-
representative: string;
|
|
77
|
-
severity: 'low' | 'medium' | 'high';
|
|
78
|
-
}[];
|
|
79
|
-
contentClusters: ClusterInfo[];
|
|
80
66
|
/**
|
|
81
67
|
* Generates a unique key for an edge.
|
|
82
68
|
*/
|
|
@@ -93,7 +79,7 @@ export declare class Graph {
|
|
|
93
79
|
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
94
80
|
* Depth is only set on creation (BFS guarantees shortest path first).
|
|
95
81
|
*/
|
|
96
|
-
addNode(url: string, depth: number, status?: number): void;
|
|
82
|
+
addNode(url: string, depth: number, status?: number, isInternal?: boolean): void;
|
|
97
83
|
updateNodeData(url: string, data: Partial<GraphNode>): void;
|
|
98
84
|
/**
|
|
99
85
|
* Adds a directed edge between two nodes.
|
|
@@ -106,14 +92,6 @@ export declare class Graph {
|
|
|
106
92
|
toJSON(): {
|
|
107
93
|
nodes: GraphNode[];
|
|
108
94
|
edges: GraphEdge[];
|
|
109
|
-
duplicateClusters: {
|
|
110
|
-
id: string;
|
|
111
|
-
type: "exact" | "near" | "template_heavy";
|
|
112
|
-
size: number;
|
|
113
|
-
representative: string;
|
|
114
|
-
severity: "low" | "medium" | "high";
|
|
115
|
-
}[];
|
|
116
|
-
contentClusters: ClusterInfo[];
|
|
117
95
|
};
|
|
118
96
|
static fromJSON(json: any): Graph;
|
|
119
97
|
}
|
package/dist/graph/graph.js
CHANGED
|
@@ -9,32 +9,33 @@ export class Graph {
|
|
|
9
9
|
pagesSkipped: 0,
|
|
10
10
|
totalFound: 0
|
|
11
11
|
};
|
|
12
|
-
trapClusters = [];
|
|
13
|
-
duplicateClusters = [];
|
|
14
|
-
contentClusters = [];
|
|
15
12
|
/**
|
|
16
13
|
* Generates a unique key for an edge.
|
|
17
14
|
*/
|
|
18
15
|
static getEdgeKey(source, target) {
|
|
19
|
-
return
|
|
16
|
+
return source + '\x00' + target;
|
|
20
17
|
}
|
|
21
18
|
/**
|
|
22
19
|
* Parses an edge key back into source and target.
|
|
23
20
|
*/
|
|
24
21
|
static parseEdgeKey(key) {
|
|
25
|
-
const
|
|
26
|
-
return {
|
|
22
|
+
const splitIndex = key.indexOf('\x00');
|
|
23
|
+
return {
|
|
24
|
+
source: key.slice(0, splitIndex),
|
|
25
|
+
target: key.slice(splitIndex + 1)
|
|
26
|
+
};
|
|
27
27
|
}
|
|
28
28
|
/**
|
|
29
29
|
* Adds a node to the graph if it doesn't exist.
|
|
30
30
|
* If it exists, updates the status if the new status is non-zero (meaning we crawled it).
|
|
31
31
|
* Depth is only set on creation (BFS guarantees shortest path first).
|
|
32
32
|
*/
|
|
33
|
-
addNode(url, depth, status = 0) {
|
|
33
|
+
addNode(url, depth, status = 0, isInternal = true) {
|
|
34
34
|
const existing = this.nodes.get(url);
|
|
35
35
|
if (!existing) {
|
|
36
36
|
this.nodes.set(url, {
|
|
37
37
|
url,
|
|
38
|
+
isInternal,
|
|
38
39
|
depth,
|
|
39
40
|
status,
|
|
40
41
|
inLinks: 0,
|
|
@@ -46,6 +47,9 @@ export class Graph {
|
|
|
46
47
|
if (status !== 0) {
|
|
47
48
|
existing.status = status;
|
|
48
49
|
}
|
|
50
|
+
if (isInternal !== undefined) {
|
|
51
|
+
existing.isInternal = isInternal;
|
|
52
|
+
}
|
|
49
53
|
}
|
|
50
54
|
}
|
|
51
55
|
updateNodeData(url, data) {
|
|
@@ -90,9 +94,7 @@ export class Graph {
|
|
|
90
94
|
toJSON() {
|
|
91
95
|
return {
|
|
92
96
|
nodes: this.getNodes(),
|
|
93
|
-
edges: this.getEdges()
|
|
94
|
-
duplicateClusters: this.duplicateClusters,
|
|
95
|
-
contentClusters: this.contentClusters
|
|
97
|
+
edges: this.getEdges()
|
|
96
98
|
};
|
|
97
99
|
}
|
|
98
100
|
static fromJSON(json) {
|
|
@@ -108,12 +110,6 @@ export class Graph {
|
|
|
108
110
|
graph.edges.set(key, edge.weight || 1.0);
|
|
109
111
|
}
|
|
110
112
|
}
|
|
111
|
-
if (json.duplicateClusters) {
|
|
112
|
-
graph.duplicateClusters = json.duplicateClusters;
|
|
113
|
-
}
|
|
114
|
-
if (json.contentClusters) {
|
|
115
|
-
graph.contentClusters = json.contentClusters;
|
|
116
|
-
}
|
|
117
113
|
return graph;
|
|
118
114
|
}
|
|
119
115
|
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { Graph } from './graph.js';
|
|
2
|
+
export type LinkRole = 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
|
|
3
|
+
export interface HITSRow {
|
|
4
|
+
authority_score: number;
|
|
5
|
+
hub_score: number;
|
|
6
|
+
link_role: LinkRole;
|
|
7
|
+
}
|
|
8
|
+
export interface HITSOptions {
|
|
9
|
+
iterations?: number;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Service to compute Hub and Authority scores using the HITS algorithm.
|
|
13
|
+
* Operates purely on the internal link graph.
|
|
14
|
+
*/
|
|
15
|
+
export declare class HITSService {
|
|
16
|
+
/**
|
|
17
|
+
* Computes Hub and Authority scores using the HITS algorithm.
|
|
18
|
+
* @param {Graph} graph - The link graph to analyze.
|
|
19
|
+
* @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
|
|
20
|
+
* @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
|
|
21
|
+
*/
|
|
22
|
+
evaluate(graph: Graph, options?: HITSOptions): Map<string, HITSRow>;
|
|
23
|
+
}
|