@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/db/index.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import Database from 'better-sqlite3';
|
|
2
1
|
import path from 'node:path';
|
|
3
2
|
import fs from 'node:fs';
|
|
4
3
|
import os from 'node:os';
|
|
5
|
-
import {
|
|
4
|
+
import { CrawlithDB } from './CrawlithDB.js';
|
|
6
5
|
let dbInstance = null;
|
|
6
|
+
let crawlithDbInstance = null;
|
|
7
7
|
export * from './repositories/SiteRepository.js';
|
|
8
8
|
export * from './repositories/SnapshotRepository.js';
|
|
9
|
-
export
|
|
9
|
+
export * from './CrawlithDB.js';
|
|
10
10
|
export function getDbPath() {
|
|
11
11
|
if (process.env.NODE_ENV === 'test') {
|
|
12
12
|
return ':memory:';
|
|
@@ -23,42 +23,50 @@ export function getDbPath() {
|
|
|
23
23
|
}
|
|
24
24
|
return path.join(crawlithDir, 'crawlith.db');
|
|
25
25
|
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
/**
|
|
27
|
+
* Returns the higher-level CrawlithDB wrapper for plugins and new code.
|
|
28
|
+
*/
|
|
29
|
+
export function getCrawlithDB() {
|
|
30
|
+
if (crawlithDbInstance) {
|
|
31
|
+
return crawlithDbInstance;
|
|
29
32
|
}
|
|
30
33
|
const dbPath = getDbPath();
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
db.pragma('synchronous = NORMAL');
|
|
35
|
-
db.pragma('foreign_keys = ON');
|
|
36
|
-
db.pragma('temp_store = MEMORY');
|
|
37
|
-
db.pragma('mmap_size = 30000000000');
|
|
38
|
-
db.pragma('cache_size = -20000');
|
|
39
|
-
db.pragma('busy_timeout = 5000');
|
|
40
|
-
// Security controls
|
|
41
|
-
// Ensure file permissions are 600 (user read/write only)
|
|
34
|
+
crawlithDbInstance = new CrawlithDB(dbPath);
|
|
35
|
+
dbInstance = crawlithDbInstance.unsafeGetRawDb();
|
|
36
|
+
// Migrations for existing tables
|
|
42
37
|
try {
|
|
43
|
-
|
|
38
|
+
dbInstance.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
|
|
44
39
|
}
|
|
45
|
-
catch (_e) {
|
|
46
|
-
|
|
47
|
-
|
|
40
|
+
catch (_e) { /* ignore */ }
|
|
41
|
+
// Security controls: Ensure file permissions are 600 (user read/write only)
|
|
42
|
+
if (dbPath !== ':memory:') {
|
|
43
|
+
try {
|
|
44
|
+
fs.chmodSync(dbPath, 0o600);
|
|
45
|
+
}
|
|
46
|
+
catch (_e) {
|
|
47
|
+
// might fail if file doesn't exist yet but better-sqlite3 should have created it
|
|
48
|
+
}
|
|
48
49
|
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
50
|
+
return crawlithDbInstance;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Returns the raw better-sqlite3 Database instance for legacy repositories.
|
|
54
|
+
*/
|
|
55
|
+
export function getDb() {
|
|
56
|
+
if (dbInstance) {
|
|
57
|
+
return dbInstance;
|
|
54
58
|
}
|
|
55
|
-
//
|
|
56
|
-
|
|
57
|
-
dbInstance
|
|
58
|
-
return db;
|
|
59
|
+
// Initializing via getCrawlithDB ensures consistent configuration
|
|
60
|
+
getCrawlithDB();
|
|
61
|
+
return dbInstance;
|
|
59
62
|
}
|
|
60
63
|
export function closeDb() {
|
|
61
|
-
if (
|
|
64
|
+
if (crawlithDbInstance) {
|
|
65
|
+
crawlithDbInstance.close();
|
|
66
|
+
crawlithDbInstance = null;
|
|
67
|
+
dbInstance = null;
|
|
68
|
+
}
|
|
69
|
+
else if (dbInstance) {
|
|
62
70
|
dbInstance.close();
|
|
63
71
|
dbInstance = null;
|
|
64
72
|
}
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
export function
|
|
1
|
+
export function runBaseMigrations(db) {
|
|
2
2
|
// Sites Table
|
|
3
3
|
db.exec(`
|
|
4
4
|
CREATE TABLE IF NOT EXISTS sites (
|
|
5
5
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
6
6
|
domain TEXT UNIQUE NOT NULL,
|
|
7
|
+
preferred_url TEXT,
|
|
8
|
+
ssl INTEGER,
|
|
7
9
|
created_at TEXT DEFAULT (datetime('now')),
|
|
8
10
|
settings_json TEXT,
|
|
9
11
|
is_active INTEGER DEFAULT 1
|
|
@@ -14,18 +16,36 @@ export function initSchema(db) {
|
|
|
14
16
|
CREATE TABLE IF NOT EXISTS snapshots (
|
|
15
17
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
16
18
|
site_id INTEGER NOT NULL,
|
|
17
|
-
|
|
19
|
+
run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed',
|
|
18
20
|
created_at TEXT DEFAULT (datetime('now')),
|
|
19
21
|
node_count INTEGER DEFAULT 0,
|
|
20
22
|
edge_count INTEGER DEFAULT 0,
|
|
21
|
-
status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
|
|
23
|
+
status TEXT CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'running',
|
|
22
24
|
limit_reached INTEGER DEFAULT 0,
|
|
23
25
|
health_score REAL,
|
|
24
26
|
orphan_count INTEGER,
|
|
25
27
|
thin_content_count INTEGER,
|
|
28
|
+
total_score REAL,
|
|
29
|
+
score_count INTEGER,
|
|
30
|
+
score_weight_sum REAL,
|
|
31
|
+
score_calculated_at TEXT,
|
|
26
32
|
FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
|
|
27
33
|
);
|
|
28
34
|
`);
|
|
35
|
+
// Migration for snapshots: run_type and status
|
|
36
|
+
try {
|
|
37
|
+
db.exec(`ALTER TABLE snapshots ADD COLUMN run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed';`);
|
|
38
|
+
}
|
|
39
|
+
catch (_e) { /* ignore */ }
|
|
40
|
+
try {
|
|
41
|
+
// If type column exists, populate run_type from it
|
|
42
|
+
db.exec(`UPDATE snapshots SET run_type = CASE WHEN type = 'partial' THEN 'single' ELSE 'completed' END WHERE run_type IS NULL OR run_type = 'full' OR run_type = 'completed';`);
|
|
43
|
+
}
|
|
44
|
+
catch (_e) { /* ignore */ }
|
|
45
|
+
try {
|
|
46
|
+
db.exec(`ALTER TABLE snapshots DROP COLUMN type;`);
|
|
47
|
+
}
|
|
48
|
+
catch (_e) { /* ignore */ }
|
|
29
49
|
// Pages Table
|
|
30
50
|
db.exec(`
|
|
31
51
|
CREATE TABLE IF NOT EXISTS pages (
|
|
@@ -41,14 +61,15 @@ export function initSchema(db) {
|
|
|
41
61
|
etag TEXT,
|
|
42
62
|
last_modified TEXT,
|
|
43
63
|
html TEXT,
|
|
44
|
-
soft404_score REAL,
|
|
45
64
|
noindex INTEGER DEFAULT 0,
|
|
46
65
|
nofollow INTEGER DEFAULT 0,
|
|
47
66
|
security_error TEXT,
|
|
48
67
|
retries INTEGER DEFAULT 0,
|
|
49
68
|
depth INTEGER DEFAULT 0,
|
|
69
|
+
discovered_via_sitemap INTEGER DEFAULT 0,
|
|
50
70
|
redirect_chain TEXT,
|
|
51
71
|
bytes_received INTEGER,
|
|
72
|
+
is_internal INTEGER DEFAULT 1,
|
|
52
73
|
crawl_trap_flag INTEGER DEFAULT 0,
|
|
53
74
|
crawl_trap_risk REAL,
|
|
54
75
|
trap_type TEXT,
|
|
@@ -60,7 +81,15 @@ export function initSchema(db) {
|
|
|
60
81
|
UNIQUE(site_id, normalized_url)
|
|
61
82
|
);
|
|
62
83
|
`);
|
|
63
|
-
//
|
|
84
|
+
// Migrations for existing tables
|
|
85
|
+
try {
|
|
86
|
+
db.exec(`ALTER TABLE pages ADD COLUMN is_internal INTEGER DEFAULT 1;`);
|
|
87
|
+
}
|
|
88
|
+
catch (_e) { /* ignore */ }
|
|
89
|
+
try {
|
|
90
|
+
db.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
|
|
91
|
+
}
|
|
92
|
+
catch (_e) { /* ignore */ }
|
|
64
93
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`);
|
|
65
94
|
// Edges Table
|
|
66
95
|
db.exec(`
|
|
@@ -76,26 +105,29 @@ export function initSchema(db) {
|
|
|
76
105
|
FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
|
|
77
106
|
);
|
|
78
107
|
`);
|
|
79
|
-
// Index for Edges
|
|
80
|
-
db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`);
|
|
81
108
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`);
|
|
82
109
|
// Metrics Table
|
|
83
110
|
db.exec(`
|
|
84
111
|
CREATE TABLE IF NOT EXISTS metrics (
|
|
85
112
|
snapshot_id INTEGER NOT NULL,
|
|
86
113
|
page_id INTEGER NOT NULL,
|
|
87
|
-
authority_score REAL,
|
|
88
|
-
hub_score REAL,
|
|
89
|
-
pagerank REAL,
|
|
90
|
-
pagerank_score REAL,
|
|
91
|
-
link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
|
|
92
114
|
crawl_status TEXT,
|
|
93
115
|
word_count INTEGER,
|
|
94
116
|
thin_content_score REAL,
|
|
95
117
|
external_link_ratio REAL,
|
|
96
118
|
orphan_score INTEGER,
|
|
119
|
+
pagerank_score REAL,
|
|
120
|
+
hub_score REAL,
|
|
121
|
+
auth_score REAL,
|
|
122
|
+
link_role TEXT,
|
|
97
123
|
duplicate_cluster_id TEXT,
|
|
98
|
-
duplicate_type TEXT
|
|
124
|
+
duplicate_type TEXT,
|
|
125
|
+
cluster_id INTEGER,
|
|
126
|
+
soft404_score REAL,
|
|
127
|
+
heading_score REAL,
|
|
128
|
+
orphan_type TEXT,
|
|
129
|
+
impact_level TEXT,
|
|
130
|
+
heading_data TEXT,
|
|
99
131
|
is_cluster_primary INTEGER DEFAULT 0,
|
|
100
132
|
PRIMARY KEY(snapshot_id, page_id),
|
|
101
133
|
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
|
|
@@ -129,41 +161,56 @@ export function initSchema(db) {
|
|
|
129
161
|
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
|
|
130
162
|
);
|
|
131
163
|
`);
|
|
132
|
-
//
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
164
|
+
// Plugin Migrations Table
|
|
165
|
+
db.exec(`
|
|
166
|
+
CREATE TABLE IF NOT EXISTS plugin_migrations (
|
|
167
|
+
plugin_name TEXT PRIMARY KEY,
|
|
168
|
+
executed_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
169
|
+
);
|
|
170
|
+
`);
|
|
171
|
+
// Universal Plugin Reports Table
|
|
172
|
+
db.exec(`
|
|
173
|
+
CREATE TABLE IF NOT EXISTS plugin_reports (
|
|
174
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
175
|
+
snapshot_id INTEGER NOT NULL,
|
|
176
|
+
plugin_name TEXT NOT NULL,
|
|
177
|
+
data TEXT NOT NULL,
|
|
178
|
+
total_score REAL,
|
|
179
|
+
score_count INTEGER,
|
|
180
|
+
score_weight_sum REAL,
|
|
181
|
+
score_calculated_at TEXT,
|
|
182
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
183
|
+
FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
|
|
184
|
+
);
|
|
185
|
+
`);
|
|
186
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_snapshot ON plugin_reports(snapshot_id);`);
|
|
187
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_composite ON plugin_reports(snapshot_id, plugin_name);`);
|
|
188
|
+
// Migrations for metrics
|
|
189
|
+
const metricsCols = [
|
|
157
190
|
['pagerank_score', 'REAL'],
|
|
191
|
+
['hub_score', 'REAL'],
|
|
192
|
+
['auth_score', 'REAL'],
|
|
158
193
|
['link_role', 'TEXT'],
|
|
159
|
-
['
|
|
160
|
-
['
|
|
161
|
-
['
|
|
194
|
+
['cluster_id', 'INTEGER'],
|
|
195
|
+
['soft404_score', 'REAL'],
|
|
196
|
+
['heading_score', 'REAL'],
|
|
197
|
+
['orphan_type', 'TEXT'],
|
|
198
|
+
['impact_level', 'TEXT'],
|
|
199
|
+
['heading_data', 'TEXT'],
|
|
162
200
|
];
|
|
163
|
-
for (const [col, type] of
|
|
201
|
+
for (const [col, type] of metricsCols) {
|
|
164
202
|
try {
|
|
165
203
|
db.exec(`ALTER TABLE metrics ADD COLUMN ${col} ${type}`);
|
|
166
204
|
}
|
|
167
|
-
catch { /*
|
|
205
|
+
catch { /* ignore */ }
|
|
206
|
+
}
|
|
207
|
+
// Final site column migrations
|
|
208
|
+
try {
|
|
209
|
+
db.exec('ALTER TABLE sites ADD COLUMN preferred_url TEXT');
|
|
210
|
+
}
|
|
211
|
+
catch { /* ignore */ }
|
|
212
|
+
try {
|
|
213
|
+
db.exec('ALTER TABLE sites ADD COLUMN ssl INTEGER');
|
|
168
214
|
}
|
|
215
|
+
catch { /* ignore */ }
|
|
169
216
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export declare class PluginRegistry {
|
|
2
|
+
private registeredPlugins;
|
|
3
|
+
private registeredTables;
|
|
4
|
+
registerPlugin(pluginName: string): void;
|
|
5
|
+
isPluginRegistered(pluginName: string): boolean;
|
|
6
|
+
registerTable(tableName: string, pluginName: string): void;
|
|
7
|
+
getPluginForTable(tableName: string): string | undefined;
|
|
8
|
+
isTableRegistered(tableName: string): boolean;
|
|
9
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
export class PluginRegistry {
|
|
2
|
+
registeredPlugins = new Set();
|
|
3
|
+
registeredTables = new Map(); // tableName -> pluginName
|
|
4
|
+
registerPlugin(pluginName) {
|
|
5
|
+
this.registeredPlugins.add(pluginName);
|
|
6
|
+
}
|
|
7
|
+
isPluginRegistered(pluginName) {
|
|
8
|
+
return this.registeredPlugins.has(pluginName);
|
|
9
|
+
}
|
|
10
|
+
registerTable(tableName, pluginName) {
|
|
11
|
+
this.registeredTables.set(tableName, pluginName);
|
|
12
|
+
}
|
|
13
|
+
getPluginForTable(tableName) {
|
|
14
|
+
return this.registeredTables.get(tableName);
|
|
15
|
+
}
|
|
16
|
+
isTableRegistered(tableName) {
|
|
17
|
+
return this.registeredTables.has(tableName);
|
|
18
|
+
}
|
|
19
|
+
}
|
|
@@ -19,6 +19,11 @@ export declare class EdgeRepository {
|
|
|
19
19
|
weight: number;
|
|
20
20
|
rel: string;
|
|
21
21
|
}[]): void;
|
|
22
|
+
/**
|
|
23
|
+
* Remove all edges originating from a specific page within a snapshot.
|
|
24
|
+
* Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
|
|
25
|
+
*/
|
|
26
|
+
deleteEdgesForPage(snapshotId: number, sourcePageId: number): void;
|
|
22
27
|
getEdgesBySnapshot(snapshotId: number): Edge[];
|
|
23
28
|
getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
|
|
24
29
|
}
|
|
@@ -21,6 +21,13 @@ export class EdgeRepository {
|
|
|
21
21
|
});
|
|
22
22
|
tx(edges);
|
|
23
23
|
}
|
|
24
|
+
/**
|
|
25
|
+
* Remove all edges originating from a specific page within a snapshot.
|
|
26
|
+
* Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
|
|
27
|
+
*/
|
|
28
|
+
deleteEdgesForPage(snapshotId, sourcePageId) {
|
|
29
|
+
this.db.prepare('DELETE FROM edges WHERE snapshot_id = ? AND source_page_id = ?').run(snapshotId, sourcePageId);
|
|
30
|
+
}
|
|
24
31
|
getEdgesBySnapshot(snapshotId) {
|
|
25
32
|
return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
|
|
26
33
|
}
|
|
@@ -2,19 +2,24 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface DbMetrics {
|
|
3
3
|
snapshot_id: number;
|
|
4
4
|
page_id: number;
|
|
5
|
-
authority_score: number | null;
|
|
6
|
-
hub_score: number | null;
|
|
7
|
-
pagerank: number | null;
|
|
8
|
-
pagerank_score: number | null;
|
|
9
|
-
link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
|
|
10
5
|
crawl_status: string | null;
|
|
11
6
|
word_count: number | null;
|
|
12
7
|
thin_content_score: number | null;
|
|
13
8
|
external_link_ratio: number | null;
|
|
14
|
-
|
|
9
|
+
pagerank_score: number | null;
|
|
10
|
+
hub_score: number | null;
|
|
11
|
+
auth_score: number | null;
|
|
12
|
+
link_role: string | null;
|
|
15
13
|
duplicate_cluster_id: string | null;
|
|
16
|
-
duplicate_type:
|
|
17
|
-
|
|
14
|
+
duplicate_type: string | null;
|
|
15
|
+
cluster_id: number | null;
|
|
16
|
+
soft404_score: number | null;
|
|
17
|
+
heading_score: number | null;
|
|
18
|
+
orphan_score: number | null;
|
|
19
|
+
orphan_type: string | null;
|
|
20
|
+
impact_level: string | null;
|
|
21
|
+
heading_data: string | null;
|
|
22
|
+
is_cluster_primary: number | null;
|
|
18
23
|
}
|
|
19
24
|
export declare class MetricsRepository {
|
|
20
25
|
private db;
|
|
@@ -7,13 +7,21 @@ export class MetricsRepository {
|
|
|
7
7
|
this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
|
|
8
8
|
this.insertStmt = this.db.prepare(`
|
|
9
9
|
INSERT OR REPLACE INTO metrics (
|
|
10
|
-
snapshot_id, page_id,
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
snapshot_id, page_id,
|
|
11
|
+
crawl_status, word_count, thin_content_score, external_link_ratio,
|
|
12
|
+
pagerank_score, hub_score, auth_score, link_role,
|
|
13
|
+
duplicate_cluster_id, duplicate_type, cluster_id,
|
|
14
|
+
soft404_score, heading_score,
|
|
15
|
+
orphan_score, orphan_type, impact_level,
|
|
16
|
+
heading_data, is_cluster_primary
|
|
13
17
|
) VALUES (
|
|
14
|
-
@snapshot_id, @page_id,
|
|
15
|
-
@
|
|
16
|
-
@
|
|
18
|
+
@snapshot_id, @page_id,
|
|
19
|
+
@crawl_status, @word_count, @thin_content_score, @external_link_ratio,
|
|
20
|
+
@pagerank_score, @hub_score, @auth_score, @link_role,
|
|
21
|
+
@duplicate_cluster_id, @duplicate_type, @cluster_id,
|
|
22
|
+
@soft404_score, @heading_score,
|
|
23
|
+
@orphan_score, @orphan_type, @impact_level,
|
|
24
|
+
@heading_data, @is_cluster_primary
|
|
17
25
|
)
|
|
18
26
|
`);
|
|
19
27
|
}
|
|
@@ -12,14 +12,15 @@ export interface Page {
|
|
|
12
12
|
etag: string | null;
|
|
13
13
|
last_modified: string | null;
|
|
14
14
|
html: string | null;
|
|
15
|
-
soft404_score: number | null;
|
|
16
15
|
noindex: number;
|
|
17
16
|
nofollow: number;
|
|
18
17
|
security_error: string | null;
|
|
19
18
|
retries: number;
|
|
20
19
|
depth: number;
|
|
20
|
+
discovered_via_sitemap: number;
|
|
21
21
|
redirect_chain: string | null;
|
|
22
22
|
bytes_received: number | null;
|
|
23
|
+
is_internal: number;
|
|
23
24
|
crawl_trap_flag: number;
|
|
24
25
|
crawl_trap_risk: number | null;
|
|
25
26
|
trap_type: string | null;
|
|
@@ -48,11 +49,12 @@ export declare class PageRepository {
|
|
|
48
49
|
normalized_url: string;
|
|
49
50
|
last_seen_snapshot_id: number;
|
|
50
51
|
})[]): Map<string, number>;
|
|
51
|
-
getPagesBySnapshot(snapshotId: number): Page[];
|
|
52
|
+
getPagesBySnapshot(snapshotId: number, runType?: string): Page[];
|
|
52
53
|
getPagesIdentityBySnapshot(snapshotId: number): {
|
|
53
54
|
id: number;
|
|
54
55
|
normalized_url: string;
|
|
55
56
|
}[];
|
|
56
|
-
getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page>;
|
|
57
|
+
getPagesIteratorBySnapshot(snapshotId: number, runType?: string): IterableIterator<Page>;
|
|
57
58
|
getIdByUrl(siteId: number, url: string): number | undefined;
|
|
59
|
+
reconcileInternalUrls(siteId: number, siteOrigin: string): void;
|
|
58
60
|
}
|
|
@@ -8,17 +8,18 @@ export class PageRepository {
|
|
|
8
8
|
INSERT INTO pages (
|
|
9
9
|
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
10
10
|
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
11
|
-
|
|
12
|
-
redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
11
|
+
noindex, nofollow, security_error, retries, depth,
|
|
12
|
+
discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
13
13
|
updated_at
|
|
14
14
|
) VALUES (
|
|
15
15
|
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
16
16
|
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
17
|
-
@
|
|
18
|
-
@redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
17
|
+
@noindex, @nofollow, @security_error, @retries, @depth,
|
|
18
|
+
@discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
19
19
|
datetime('now')
|
|
20
20
|
)
|
|
21
21
|
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
22
|
+
first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
|
|
22
23
|
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
23
24
|
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
24
25
|
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
@@ -27,14 +28,15 @@ export class PageRepository {
|
|
|
27
28
|
etag = COALESCE(excluded.etag, pages.etag),
|
|
28
29
|
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
29
30
|
html = COALESCE(excluded.html, pages.html),
|
|
30
|
-
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
31
31
|
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
32
32
|
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
33
33
|
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
34
34
|
retries = MAX(pages.retries, excluded.retries),
|
|
35
35
|
depth = MIN(pages.depth, excluded.depth),
|
|
36
|
+
discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
|
|
36
37
|
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
37
38
|
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
39
|
+
is_internal = COALESCE(excluded.is_internal, pages.is_internal),
|
|
38
40
|
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
39
41
|
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
40
42
|
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
@@ -55,14 +57,15 @@ export class PageRepository {
|
|
|
55
57
|
etag: page.etag ?? null,
|
|
56
58
|
last_modified: page.last_modified ?? null,
|
|
57
59
|
html: page.html ?? null,
|
|
58
|
-
soft404_score: page.soft404_score ?? null,
|
|
59
60
|
noindex: page.noindex ?? 0,
|
|
60
61
|
nofollow: page.nofollow ?? 0,
|
|
61
62
|
security_error: page.security_error ?? null,
|
|
62
63
|
retries: page.retries ?? 0,
|
|
63
64
|
depth: page.depth ?? 0,
|
|
65
|
+
discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
|
|
64
66
|
redirect_chain: page.redirect_chain ?? null,
|
|
65
67
|
bytes_received: page.bytes_received ?? null,
|
|
68
|
+
is_internal: page.is_internal ?? 1,
|
|
66
69
|
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
67
70
|
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
68
71
|
trap_type: page.trap_type ?? null,
|
|
@@ -103,17 +106,18 @@ export class PageRepository {
|
|
|
103
106
|
INSERT INTO pages (
|
|
104
107
|
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
105
108
|
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
106
|
-
|
|
107
|
-
redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
109
|
+
noindex, nofollow, security_error, retries, depth,
|
|
110
|
+
discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
108
111
|
updated_at
|
|
109
112
|
) VALUES (
|
|
110
113
|
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
111
114
|
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
112
|
-
@
|
|
113
|
-
@redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
115
|
+
@noindex, @nofollow, @security_error, @retries, @depth,
|
|
116
|
+
@discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
114
117
|
datetime('now')
|
|
115
118
|
)
|
|
116
119
|
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
120
|
+
first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
|
|
117
121
|
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
118
122
|
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
119
123
|
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
@@ -122,14 +126,15 @@ export class PageRepository {
|
|
|
122
126
|
etag = COALESCE(excluded.etag, pages.etag),
|
|
123
127
|
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
124
128
|
html = COALESCE(excluded.html, pages.html),
|
|
125
|
-
soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
|
|
126
129
|
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
127
130
|
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
128
131
|
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
129
132
|
retries = MAX(pages.retries, excluded.retries),
|
|
130
133
|
depth = MIN(pages.depth, excluded.depth),
|
|
134
|
+
discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
|
|
131
135
|
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
132
136
|
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
137
|
+
is_internal = COALESCE(excluded.is_internal, pages.is_internal),
|
|
133
138
|
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
134
139
|
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
135
140
|
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
@@ -151,14 +156,15 @@ export class PageRepository {
|
|
|
151
156
|
etag: page.etag ?? null,
|
|
152
157
|
last_modified: page.last_modified ?? null,
|
|
153
158
|
html: page.html ?? null,
|
|
154
|
-
soft404_score: page.soft404_score ?? null,
|
|
155
159
|
noindex: page.noindex ?? 0,
|
|
156
160
|
nofollow: page.nofollow ?? 0,
|
|
157
161
|
security_error: page.security_error ?? null,
|
|
158
162
|
retries: page.retries ?? 0,
|
|
159
163
|
depth: page.depth ?? 0,
|
|
164
|
+
discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
|
|
160
165
|
redirect_chain: page.redirect_chain ?? null,
|
|
161
166
|
bytes_received: page.bytes_received ?? null,
|
|
167
|
+
is_internal: page.is_internal ?? 1,
|
|
162
168
|
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
163
169
|
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
164
170
|
trap_type: page.trap_type ?? null,
|
|
@@ -170,17 +176,62 @@ export class PageRepository {
|
|
|
170
176
|
tx(pages);
|
|
171
177
|
return urlToId;
|
|
172
178
|
}
|
|
173
|
-
getPagesBySnapshot(snapshotId) {
|
|
174
|
-
|
|
179
|
+
getPagesBySnapshot(snapshotId, runType = 'completed') {
|
|
180
|
+
if (runType === 'single') {
|
|
181
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').all(snapshotId);
|
|
182
|
+
}
|
|
183
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
|
|
175
184
|
}
|
|
176
185
|
getPagesIdentityBySnapshot(snapshotId) {
|
|
177
|
-
|
|
186
|
+
// For identities, always loading all up to this point is fine for the crawler to map URLs to IDs.
|
|
187
|
+
return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
|
|
178
188
|
}
|
|
179
|
-
getPagesIteratorBySnapshot(snapshotId) {
|
|
180
|
-
|
|
189
|
+
getPagesIteratorBySnapshot(snapshotId, runType = 'completed') {
|
|
190
|
+
if (runType === 'single') {
|
|
191
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').iterate(snapshotId);
|
|
192
|
+
}
|
|
193
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').iterate(snapshotId, snapshotId);
|
|
181
194
|
}
|
|
182
195
|
getIdByUrl(siteId, url) {
|
|
183
196
|
const row = this.getIdStmt.get(siteId, url);
|
|
184
197
|
return row?.id;
|
|
185
198
|
}
|
|
199
|
+
reconcileInternalUrls(siteId, siteOrigin) {
|
|
200
|
+
const origin = siteOrigin.replace(/\/+$/, '');
|
|
201
|
+
const tx = this.db.transaction(() => {
|
|
202
|
+
const rows = this.db
|
|
203
|
+
.prepare("SELECT id, normalized_url FROM pages WHERE site_id = ? AND (normalized_url LIKE 'http://%' OR normalized_url LIKE 'https://%')")
|
|
204
|
+
.all(siteId);
|
|
205
|
+
for (const row of rows) {
|
|
206
|
+
let parsed;
|
|
207
|
+
try {
|
|
208
|
+
parsed = new URL(row.normalized_url);
|
|
209
|
+
}
|
|
210
|
+
catch {
|
|
211
|
+
continue;
|
|
212
|
+
}
|
|
213
|
+
if (parsed.origin !== origin) {
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
const targetPath = `${parsed.pathname}${parsed.search}`;
|
|
217
|
+
if (targetPath === row.normalized_url) {
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
220
|
+
const existing = this.db
|
|
221
|
+
.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?')
|
|
222
|
+
.get(siteId, targetPath);
|
|
223
|
+
if (existing && existing.id !== row.id) {
|
|
224
|
+
this.db.prepare('UPDATE edges SET source_page_id = ? WHERE source_page_id = ?').run(existing.id, row.id);
|
|
225
|
+
this.db.prepare('UPDATE edges SET target_page_id = ? WHERE target_page_id = ?').run(existing.id, row.id);
|
|
226
|
+
this.db.prepare('UPDATE OR IGNORE metrics SET page_id = ? WHERE page_id = ?').run(existing.id, row.id);
|
|
227
|
+
this.db.prepare('DELETE FROM metrics WHERE page_id = ?').run(row.id);
|
|
228
|
+
this.db.prepare('DELETE FROM pages WHERE id = ?').run(row.id);
|
|
229
|
+
}
|
|
230
|
+
else {
|
|
231
|
+
this.db.prepare('UPDATE pages SET normalized_url = ? WHERE id = ?').run(targetPath, row.id);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
});
|
|
235
|
+
tx();
|
|
236
|
+
}
|
|
186
237
|
}
|