@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/db/index.js
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
|
-
import Database from 'better-sqlite3';
|
|
2
1
|
import path from 'node:path';
|
|
3
2
|
import fs from 'node:fs';
|
|
4
3
|
import os from 'node:os';
|
|
5
|
-
import {
|
|
4
|
+
import { CrawlithDB } from './CrawlithDB.js';
|
|
6
5
|
let dbInstance = null;
|
|
6
|
+
let crawlithDbInstance = null;
|
|
7
|
+
export * from './repositories/SiteRepository.js';
|
|
8
|
+
export * from './repositories/SnapshotRepository.js';
|
|
9
|
+
export * from './CrawlithDB.js';
|
|
7
10
|
export function getDbPath() {
|
|
8
11
|
if (process.env.NODE_ENV === 'test') {
|
|
9
12
|
return ':memory:';
|
|
@@ -20,41 +23,50 @@ export function getDbPath() {
|
|
|
20
23
|
}
|
|
21
24
|
return path.join(crawlithDir, 'crawlith.db');
|
|
22
25
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
+
/**
|
|
27
|
+
* Returns the higher-level CrawlithDB wrapper for plugins and new code.
|
|
28
|
+
*/
|
|
29
|
+
export function getCrawlithDB() {
|
|
30
|
+
if (crawlithDbInstance) {
|
|
31
|
+
return crawlithDbInstance;
|
|
26
32
|
}
|
|
27
33
|
const dbPath = getDbPath();
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
db.pragma('synchronous = NORMAL');
|
|
32
|
-
db.pragma('foreign_keys = ON');
|
|
33
|
-
db.pragma('temp_store = MEMORY');
|
|
34
|
-
db.pragma('mmap_size = 30000000000');
|
|
35
|
-
db.pragma('cache_size = -20000');
|
|
36
|
-
db.pragma('busy_timeout = 5000');
|
|
37
|
-
// Security controls
|
|
38
|
-
// Ensure file permissions are 600 (user read/write only)
|
|
34
|
+
crawlithDbInstance = new CrawlithDB(dbPath);
|
|
35
|
+
dbInstance = crawlithDbInstance.unsafeGetRawDb();
|
|
36
|
+
// Migrations for existing tables
|
|
39
37
|
try {
|
|
40
|
-
|
|
38
|
+
dbInstance.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
|
|
41
39
|
}
|
|
42
|
-
catch (_e) {
|
|
43
|
-
|
|
44
|
-
|
|
40
|
+
catch (_e) { /* ignore */ }
|
|
41
|
+
// Security controls: Ensure file permissions are 600 (user read/write only)
|
|
42
|
+
if (dbPath !== ':memory:') {
|
|
43
|
+
try {
|
|
44
|
+
fs.chmodSync(dbPath, 0o600);
|
|
45
|
+
}
|
|
46
|
+
catch (_e) {
|
|
47
|
+
// might fail if file doesn't exist yet but better-sqlite3 should have created it
|
|
48
|
+
}
|
|
45
49
|
}
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
+
return crawlithDbInstance;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Returns the raw better-sqlite3 Database instance for legacy repositories.
|
|
54
|
+
*/
|
|
55
|
+
export function getDb() {
|
|
56
|
+
if (dbInstance) {
|
|
57
|
+
return dbInstance;
|
|
50
58
|
}
|
|
51
|
-
//
|
|
52
|
-
|
|
53
|
-
dbInstance
|
|
54
|
-
return db;
|
|
59
|
+
// Initializing via getCrawlithDB ensures consistent configuration
|
|
60
|
+
getCrawlithDB();
|
|
61
|
+
return dbInstance;
|
|
55
62
|
}
|
|
56
63
|
export function closeDb() {
|
|
57
|
-
if (
|
|
64
|
+
if (crawlithDbInstance) {
|
|
65
|
+
crawlithDbInstance.close();
|
|
66
|
+
crawlithDbInstance = null;
|
|
67
|
+
dbInstance = null;
|
|
68
|
+
}
|
|
69
|
+
else if (dbInstance) {
|
|
58
70
|
dbInstance.close();
|
|
59
71
|
dbInstance = null;
|
|
60
72
|
}
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
export function
|
|
1
|
+
export function runBaseMigrations(db) {
|
|
2
2
|
// Sites Table
|
|
3
3
|
db.exec(`
|
|
4
4
|
CREATE TABLE IF NOT EXISTS sites (
|
|
5
5
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
6
6
|
domain TEXT UNIQUE NOT NULL,
|
|
7
|
+
preferred_url TEXT,
|
|
8
|
+
ssl INTEGER,
|
|
7
9
|
created_at TEXT DEFAULT (datetime('now')),
|
|
8
10
|
settings_json TEXT,
|
|
9
11
|
is_active INTEGER DEFAULT 1
|
|
@@ -14,18 +16,36 @@ export function initSchema(db) {
|
|
|
14
16
|
CREATE TABLE IF NOT EXISTS snapshots (
|
|
15
17
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
16
18
|
site_id INTEGER NOT NULL,
|
|
17
|
-
|
|
19
|
+
run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed',
|
|
18
20
|
created_at TEXT DEFAULT (datetime('now')),
|
|
19
21
|
node_count INTEGER DEFAULT 0,
|
|
20
22
|
edge_count INTEGER DEFAULT 0,
|
|
21
|
-
status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
|
|
23
|
+
status TEXT CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'running',
|
|
22
24
|
limit_reached INTEGER DEFAULT 0,
|
|
23
25
|
health_score REAL,
|
|
24
26
|
orphan_count INTEGER,
|
|
25
27
|
thin_content_count INTEGER,
|
|
28
|
+
total_score REAL,
|
|
29
|
+
score_count INTEGER,
|
|
30
|
+
score_weight_sum REAL,
|
|
31
|
+
score_calculated_at TEXT,
|
|
26
32
|
FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
|
|
27
33
|
);
|
|
28
34
|
`);
|
|
35
|
+
// Migration for snapshots: run_type and status
|
|
36
|
+
try {
|
|
37
|
+
db.exec(`ALTER TABLE snapshots ADD COLUMN run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed';`);
|
|
38
|
+
}
|
|
39
|
+
catch (_e) { /* ignore */ }
|
|
40
|
+
try {
|
|
41
|
+
// If type column exists, populate run_type from it
|
|
42
|
+
db.exec(`UPDATE snapshots SET run_type = CASE WHEN type = 'partial' THEN 'single' ELSE 'completed' END WHERE run_type IS NULL OR run_type = 'full' OR run_type = 'completed';`);
|
|
43
|
+
}
|
|
44
|
+
catch (_e) { /* ignore */ }
|
|
45
|
+
try {
|
|
46
|
+
db.exec(`ALTER TABLE snapshots DROP COLUMN type;`);
|
|
47
|
+
}
|
|
48
|
+
catch (_e) { /* ignore */ }
|
|
29
49
|
// Pages Table
|
|
30
50
|
db.exec(`
|
|
31
51
|
CREATE TABLE IF NOT EXISTS pages (
|
|
@@ -41,14 +61,15 @@ export function initSchema(db) {
|
|
|
41
61
|
etag TEXT,
|
|
42
62
|
last_modified TEXT,
|
|
43
63
|
html TEXT,
|
|
44
|
-
soft404_score REAL,
|
|
45
64
|
noindex INTEGER DEFAULT 0,
|
|
46
65
|
nofollow INTEGER DEFAULT 0,
|
|
47
66
|
security_error TEXT,
|
|
48
67
|
retries INTEGER DEFAULT 0,
|
|
49
68
|
depth INTEGER DEFAULT 0,
|
|
69
|
+
discovered_via_sitemap INTEGER DEFAULT 0,
|
|
50
70
|
redirect_chain TEXT,
|
|
51
71
|
bytes_received INTEGER,
|
|
72
|
+
is_internal INTEGER DEFAULT 1,
|
|
52
73
|
crawl_trap_flag INTEGER DEFAULT 0,
|
|
53
74
|
crawl_trap_risk REAL,
|
|
54
75
|
trap_type TEXT,
|
|
@@ -60,7 +81,15 @@ export function initSchema(db) {
|
|
|
60
81
|
UNIQUE(site_id, normalized_url)
|
|
61
82
|
);
|
|
62
83
|
`);
|
|
63
|
-
//
|
|
84
|
+
// Migrations for existing tables
|
|
85
|
+
try {
|
|
86
|
+
db.exec(`ALTER TABLE pages ADD COLUMN is_internal INTEGER DEFAULT 1;`);
|
|
87
|
+
}
|
|
88
|
+
catch (_e) { /* ignore */ }
|
|
89
|
+
try {
|
|
90
|
+
db.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
|
|
91
|
+
}
|
|
92
|
+
catch (_e) { /* ignore */ }
|
|
64
93
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`);
|
|
65
94
|
// Edges Table
|
|
66
95
|
db.exec(`
|
|
@@ -76,26 +105,29 @@ export function initSchema(db) {
|
|
|
76
105
|
FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
|
|
77
106
|
);
|
|
78
107
|
`);
|
|
79
|
-
// Index for Edges
|
|
80
|
-
db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`);
|
|
81
108
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`);
|
|
82
109
|
// Metrics Table
|
|
83
110
|
db.exec(`
|
|
84
111
|
CREATE TABLE IF NOT EXISTS metrics (
|
|
85
112
|
snapshot_id INTEGER NOT NULL,
|
|
86
113
|
page_id INTEGER NOT NULL,
|
|
87
|
-
authority_score REAL,
|
|
88
|
-
hub_score REAL,
|
|
89
|
-
pagerank REAL,
|
|
90
|
-
pagerank_score REAL,
|
|
91
|
-
link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
|
|
92
114
|
crawl_status TEXT,
|
|
93
115
|
word_count INTEGER,
|
|
94
116
|
thin_content_score REAL,
|
|
95
117
|
external_link_ratio REAL,
|
|
96
118
|
orphan_score INTEGER,
|
|
119
|
+
pagerank_score REAL,
|
|
120
|
+
hub_score REAL,
|
|
121
|
+
auth_score REAL,
|
|
122
|
+
link_role TEXT,
|
|
97
123
|
duplicate_cluster_id TEXT,
|
|
98
|
-
duplicate_type TEXT
|
|
124
|
+
duplicate_type TEXT,
|
|
125
|
+
cluster_id INTEGER,
|
|
126
|
+
soft404_score REAL,
|
|
127
|
+
heading_score REAL,
|
|
128
|
+
orphan_type TEXT,
|
|
129
|
+
impact_level TEXT,
|
|
130
|
+
heading_data TEXT,
|
|
99
131
|
is_cluster_primary INTEGER DEFAULT 0,
|
|
100
132
|
PRIMARY KEY(snapshot_id, page_id),
|
|
101
133
|
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
|
|
@@ -129,41 +161,56 @@ export function initSchema(db) {
|
|
|
129
161
|
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
|
|
130
162
|
);
|
|
131
163
|
`);
|
|
132
|
-
//
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
164
|
+
// Plugin Migrations Table
|
|
165
|
+
db.exec(`
|
|
166
|
+
CREATE TABLE IF NOT EXISTS plugin_migrations (
|
|
167
|
+
plugin_name TEXT PRIMARY KEY,
|
|
168
|
+
executed_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
169
|
+
);
|
|
170
|
+
`);
|
|
171
|
+
// Universal Plugin Reports Table
|
|
172
|
+
db.exec(`
|
|
173
|
+
CREATE TABLE IF NOT EXISTS plugin_reports (
|
|
174
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
175
|
+
snapshot_id INTEGER NOT NULL,
|
|
176
|
+
plugin_name TEXT NOT NULL,
|
|
177
|
+
data TEXT NOT NULL,
|
|
178
|
+
total_score REAL,
|
|
179
|
+
score_count INTEGER,
|
|
180
|
+
score_weight_sum REAL,
|
|
181
|
+
score_calculated_at TEXT,
|
|
182
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
183
|
+
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
|
|
184
|
+
);
|
|
185
|
+
`);
|
|
186
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_snapshot ON plugin_reports(snapshot_id);`);
|
|
187
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_composite ON plugin_reports(snapshot_id, plugin_name);`);
|
|
188
|
+
// Migrations for metrics
|
|
189
|
+
const metricsCols = [
|
|
157
190
|
['pagerank_score', 'REAL'],
|
|
191
|
+
['hub_score', 'REAL'],
|
|
192
|
+
['auth_score', 'REAL'],
|
|
158
193
|
['link_role', 'TEXT'],
|
|
159
|
-
['
|
|
160
|
-
['
|
|
161
|
-
['
|
|
194
|
+
['cluster_id', 'INTEGER'],
|
|
195
|
+
['soft404_score', 'REAL'],
|
|
196
|
+
['heading_score', 'REAL'],
|
|
197
|
+
['orphan_type', 'TEXT'],
|
|
198
|
+
['impact_level', 'TEXT'],
|
|
199
|
+
['heading_data', 'TEXT'],
|
|
162
200
|
];
|
|
163
|
-
for (const [col, type] of
|
|
201
|
+
for (const [col, type] of metricsCols) {
|
|
164
202
|
try {
|
|
165
203
|
db.exec(`ALTER TABLE metrics ADD COLUMN ${col} ${type}`);
|
|
166
204
|
}
|
|
167
|
-
catch { /*
|
|
205
|
+
catch { /* ignore */ }
|
|
206
|
+
}
|
|
207
|
+
// Final site column migrations
|
|
208
|
+
try {
|
|
209
|
+
db.exec('ALTER TABLE sites ADD COLUMN preferred_url TEXT');
|
|
210
|
+
}
|
|
211
|
+
catch { /* ignore */ }
|
|
212
|
+
try {
|
|
213
|
+
db.exec('ALTER TABLE sites ADD COLUMN ssl INTEGER');
|
|
168
214
|
}
|
|
215
|
+
catch { /* ignore */ }
|
|
169
216
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export declare class PluginRegistry {
|
|
2
|
+
private registeredPlugins;
|
|
3
|
+
private registeredTables;
|
|
4
|
+
registerPlugin(pluginName: string): void;
|
|
5
|
+
isPluginRegistered(pluginName: string): boolean;
|
|
6
|
+
registerTable(tableName: string, pluginName: string): void;
|
|
7
|
+
getPluginForTable(tableName: string): string | undefined;
|
|
8
|
+
isTableRegistered(tableName: string): boolean;
|
|
9
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
export class PluginRegistry {
|
|
2
|
+
registeredPlugins = new Set();
|
|
3
|
+
registeredTables = new Map(); // tableName -> pluginName
|
|
4
|
+
registerPlugin(pluginName) {
|
|
5
|
+
this.registeredPlugins.add(pluginName);
|
|
6
|
+
}
|
|
7
|
+
isPluginRegistered(pluginName) {
|
|
8
|
+
return this.registeredPlugins.has(pluginName);
|
|
9
|
+
}
|
|
10
|
+
registerTable(tableName, pluginName) {
|
|
11
|
+
this.registeredTables.set(tableName, pluginName);
|
|
12
|
+
}
|
|
13
|
+
getPluginForTable(tableName) {
|
|
14
|
+
return this.registeredTables.get(tableName);
|
|
15
|
+
}
|
|
16
|
+
isTableRegistered(tableName) {
|
|
17
|
+
return this.registeredTables.has(tableName);
|
|
18
|
+
}
|
|
19
|
+
}
|
|
@@ -12,5 +12,18 @@ export declare class EdgeRepository {
|
|
|
12
12
|
private insertStmt;
|
|
13
13
|
constructor(db: Database);
|
|
14
14
|
insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight?: number, rel?: string): void;
|
|
15
|
+
insertEdges(edges: {
|
|
16
|
+
snapshot_id: number;
|
|
17
|
+
source_page_id: number;
|
|
18
|
+
target_page_id: number;
|
|
19
|
+
weight: number;
|
|
20
|
+
rel: string;
|
|
21
|
+
}[]): void;
|
|
22
|
+
/**
|
|
23
|
+
* Remove all edges originating from a specific page within a snapshot.
|
|
24
|
+
* Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
|
|
25
|
+
*/
|
|
26
|
+
deleteEdgesForPage(snapshotId: number, sourcePageId: number): void;
|
|
15
27
|
getEdgesBySnapshot(snapshotId: number): Edge[];
|
|
28
|
+
getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
|
|
16
29
|
}
|
|
@@ -11,7 +11,27 @@ export class EdgeRepository {
|
|
|
11
11
|
insertEdge(snapshotId, sourcePageId, targetPageId, weight = 1.0, rel = 'internal') {
|
|
12
12
|
this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
|
|
13
13
|
}
|
|
14
|
+
insertEdges(edges) {
|
|
15
|
+
if (edges.length === 0)
|
|
16
|
+
return;
|
|
17
|
+
const tx = this.db.transaction((edgesBatch) => {
|
|
18
|
+
for (const edge of edgesBatch) {
|
|
19
|
+
this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
|
|
20
|
+
}
|
|
21
|
+
});
|
|
22
|
+
tx(edges);
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Remove all edges originating from a specific page within a snapshot.
|
|
26
|
+
* Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
|
|
27
|
+
*/
|
|
28
|
+
deleteEdgesForPage(snapshotId, sourcePageId) {
|
|
29
|
+
this.db.prepare('DELETE FROM edges WHERE snapshot_id = ? AND source_page_id = ?').run(snapshotId, sourcePageId);
|
|
30
|
+
}
|
|
14
31
|
getEdgesBySnapshot(snapshotId) {
|
|
15
32
|
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
|
|
16
33
|
}
|
|
34
|
+
getEdgesIteratorBySnapshot(snapshotId) {
|
|
35
|
+
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId);
|
|
36
|
+
}
|
|
17
37
|
}
|
|
@@ -2,25 +2,33 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface DbMetrics {
|
|
3
3
|
snapshot_id: number;
|
|
4
4
|
page_id: number;
|
|
5
|
-
authority_score: number | null;
|
|
6
|
-
hub_score: number | null;
|
|
7
|
-
pagerank: number | null;
|
|
8
|
-
pagerank_score: number | null;
|
|
9
|
-
link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
|
|
10
5
|
crawl_status: string | null;
|
|
11
6
|
word_count: number | null;
|
|
12
7
|
thin_content_score: number | null;
|
|
13
8
|
external_link_ratio: number | null;
|
|
14
|
-
|
|
9
|
+
pagerank_score: number | null;
|
|
10
|
+
hub_score: number | null;
|
|
11
|
+
auth_score: number | null;
|
|
12
|
+
link_role: string | null;
|
|
15
13
|
duplicate_cluster_id: string | null;
|
|
16
|
-
duplicate_type:
|
|
17
|
-
|
|
14
|
+
duplicate_type: string | null;
|
|
15
|
+
cluster_id: number | null;
|
|
16
|
+
soft404_score: number | null;
|
|
17
|
+
heading_score: number | null;
|
|
18
|
+
orphan_score: number | null;
|
|
19
|
+
orphan_type: string | null;
|
|
20
|
+
impact_level: string | null;
|
|
21
|
+
heading_data: string | null;
|
|
22
|
+
is_cluster_primary: number | null;
|
|
18
23
|
}
|
|
19
24
|
export declare class MetricsRepository {
|
|
20
25
|
private db;
|
|
21
26
|
private insertStmt;
|
|
27
|
+
private getByPageStmt;
|
|
22
28
|
constructor(db: Database);
|
|
23
29
|
insertMetrics(metrics: DbMetrics): void;
|
|
24
30
|
getMetrics(snapshotId: number): DbMetrics[];
|
|
31
|
+
getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics>;
|
|
25
32
|
getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined;
|
|
33
|
+
insertMany(metricsList: DbMetrics[]): void;
|
|
26
34
|
}
|
|
@@ -1,17 +1,27 @@
|
|
|
1
1
|
export class MetricsRepository {
|
|
2
2
|
db;
|
|
3
3
|
insertStmt;
|
|
4
|
+
getByPageStmt;
|
|
4
5
|
constructor(db) {
|
|
5
6
|
this.db = db;
|
|
7
|
+
this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
|
|
6
8
|
this.insertStmt = this.db.prepare(`
|
|
7
9
|
INSERT OR REPLACE INTO metrics (
|
|
8
|
-
snapshot_id, page_id,
|
|
9
|
-
|
|
10
|
-
|
|
10
|
+
snapshot_id, page_id,
|
|
11
|
+
crawl_status, word_count, thin_content_score, external_link_ratio,
|
|
12
|
+
pagerank_score, hub_score, auth_score, link_role,
|
|
13
|
+
duplicate_cluster_id, duplicate_type, cluster_id,
|
|
14
|
+
soft404_score, heading_score,
|
|
15
|
+
orphan_score, orphan_type, impact_level,
|
|
16
|
+
heading_data, is_cluster_primary
|
|
11
17
|
) VALUES (
|
|
12
|
-
@snapshot_id, @page_id,
|
|
13
|
-
@
|
|
14
|
-
@
|
|
18
|
+
@snapshot_id, @page_id,
|
|
19
|
+
@crawl_status, @word_count, @thin_content_score, @external_link_ratio,
|
|
20
|
+
@pagerank_score, @hub_score, @auth_score, @link_role,
|
|
21
|
+
@duplicate_cluster_id, @duplicate_type, @cluster_id,
|
|
22
|
+
@soft404_score, @heading_score,
|
|
23
|
+
@orphan_score, @orphan_type, @impact_level,
|
|
24
|
+
@heading_data, @is_cluster_primary
|
|
15
25
|
)
|
|
16
26
|
`);
|
|
17
27
|
}
|
|
@@ -21,7 +31,18 @@ export class MetricsRepository {
|
|
|
21
31
|
getMetrics(snapshotId) {
|
|
22
32
|
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId);
|
|
23
33
|
}
|
|
34
|
+
getMetricsIterator(snapshotId) {
|
|
35
|
+
return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId);
|
|
36
|
+
}
|
|
24
37
|
getMetricsForPage(snapshotId, pageId) {
|
|
25
|
-
return this.
|
|
38
|
+
return this.getByPageStmt.get(snapshotId, pageId);
|
|
39
|
+
}
|
|
40
|
+
insertMany(metricsList) {
|
|
41
|
+
const insert = this.insertStmt;
|
|
42
|
+
const tx = this.db.transaction((items) => {
|
|
43
|
+
for (const item of items)
|
|
44
|
+
insert.run(item);
|
|
45
|
+
});
|
|
46
|
+
tx(metricsList);
|
|
26
47
|
}
|
|
27
48
|
}
|
|
@@ -12,14 +12,15 @@ export interface Page {
|
|
|
12
12
|
etag: string | null;
|
|
13
13
|
last_modified: string | null;
|
|
14
14
|
html: string | null;
|
|
15
|
-
soft404_score: number | null;
|
|
16
15
|
noindex: number;
|
|
17
16
|
nofollow: number;
|
|
18
17
|
security_error: string | null;
|
|
19
18
|
retries: number;
|
|
20
19
|
depth: number;
|
|
20
|
+
discovered_via_sitemap: number;
|
|
21
21
|
redirect_chain: string | null;
|
|
22
22
|
bytes_received: number | null;
|
|
23
|
+
is_internal: number;
|
|
23
24
|
crawl_trap_flag: number;
|
|
24
25
|
crawl_trap_risk: number | null;
|
|
25
26
|
trap_type: string | null;
|
|
@@ -42,6 +43,18 @@ export declare class PageRepository {
|
|
|
42
43
|
last_seen_snapshot_id: number;
|
|
43
44
|
}): number;
|
|
44
45
|
getPage(siteId: number, url: string): Page | undefined;
|
|
45
|
-
|
|
46
|
+
getPagesByUrls(siteId: number, urls: string[]): Page[];
|
|
47
|
+
upsertMany(pages: (Partial<Page> & {
|
|
48
|
+
site_id: number;
|
|
49
|
+
normalized_url: string;
|
|
50
|
+
last_seen_snapshot_id: number;
|
|
51
|
+
})[]): Map<string, number>;
|
|
52
|
+
getPagesBySnapshot(snapshotId: number, runType?: string): Page[];
|
|
53
|
+
getPagesIdentityBySnapshot(snapshotId: number): {
|
|
54
|
+
id: number;
|
|
55
|
+
normalized_url: string;
|
|
56
|
+
}[];
|
|
57
|
+
getPagesIteratorBySnapshot(snapshotId: number, runType?: string): IterableIterator<Page>;
|
|
46
58
|
getIdByUrl(siteId: number, url: string): number | undefined;
|
|
59
|
+
reconcileInternalUrls(siteId: number, siteOrigin: string): void;
|
|
47
60
|
}
|