@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
import Database from 'better-sqlite3';
|
|
2
|
+
import { runBaseMigrations } from './migrations.js';
|
|
3
|
+
import { Statements } from './statements.js';
|
|
4
|
+
import { PluginRegistry } from './pluginRegistry.js';
|
|
5
|
+
import { normalizeUrl } from '../crawler/normalize.js';
|
|
6
|
+
export class CrawlithDB {
|
|
7
|
+
db;
|
|
8
|
+
statements;
|
|
9
|
+
registry;
|
|
10
|
+
/**
|
|
11
|
+
* @internal
|
|
12
|
+
* Dangerous: Returns the raw better-sqlite3 instance.
|
|
13
|
+
* Core only. Plugins must never use this.
|
|
14
|
+
*/
|
|
15
|
+
unsafeGetRawDb() {
|
|
16
|
+
return this.db;
|
|
17
|
+
}
|
|
18
|
+
// Optional scoping properties
|
|
19
|
+
_pluginName;
|
|
20
|
+
_snapshotId;
|
|
21
|
+
/** Whether live fallback is allowed (from --live flag). Core-controlled. */
|
|
22
|
+
_live = false;
|
|
23
|
+
/** Whether this plugin makes network calls. Core-controlled via plugin.storage.fetchMode. */
|
|
24
|
+
_fetchMode = 'network';
|
|
25
|
+
constructor(dbPath) {
|
|
26
|
+
this.db = new Database(dbPath);
|
|
27
|
+
this.db.pragma('journal_mode = WAL');
|
|
28
|
+
this.db.pragma('synchronous = NORMAL');
|
|
29
|
+
this.db.pragma('foreign_keys = ON');
|
|
30
|
+
this.db.pragma('temp_store = MEMORY');
|
|
31
|
+
this.db.pragma('mmap_size = 30000000000');
|
|
32
|
+
this.db.pragma('cache_size = -20000');
|
|
33
|
+
this.db.pragma('busy_timeout = 5000');
|
|
34
|
+
// Integrity check on startup
|
|
35
|
+
const integrity = this.db.pragma('integrity_check', { simple: true });
|
|
36
|
+
if (integrity !== 'ok') {
|
|
37
|
+
console.warn('Database integrity check failed:', integrity);
|
|
38
|
+
}
|
|
39
|
+
this.registry = new PluginRegistry();
|
|
40
|
+
this.initialize();
|
|
41
|
+
this.statements = new Statements(this.db);
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Schema API
|
|
45
|
+
*/
|
|
46
|
+
get schema() {
|
|
47
|
+
return {
|
|
48
|
+
define: (columns) => this.registerPluginDataSchema(columns)
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Fluent Data API (URL-scoped rows)
|
|
53
|
+
*/
|
|
54
|
+
get data() {
|
|
55
|
+
return {
|
|
56
|
+
save: (input) => this.insertPluginRow(input),
|
|
57
|
+
find: (url, options) => this.getPluginRow(url, undefined, undefined, options),
|
|
58
|
+
all: () => this.getPluginRows(),
|
|
59
|
+
/**
|
|
60
|
+
* Cache-first with live fallback. Core-enforced pattern:
|
|
61
|
+
* 1. If cached data exists → return it (always, regardless of age)
|
|
62
|
+
* 2. If no cache + fetchMode='network' + live=false → return null (skip)
|
|
63
|
+
* 3. If no cache + (fetchMode='local' OR live=true) → call fetchFn, save, return
|
|
64
|
+
*
|
|
65
|
+
* Plugin authors NEVER touch ctx.live — the core injects it via scope().
|
|
66
|
+
*/
|
|
67
|
+
getOrFetch: (url, fetchFn) => this._getOrFetch(url, fetchFn),
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Report API (Global snapshot summary)
|
|
72
|
+
*/
|
|
73
|
+
get report() {
|
|
74
|
+
return {
|
|
75
|
+
save: (summary, optionalScores) => this.insertPluginReport({ summary, ...optionalScores }),
|
|
76
|
+
find: () => this.getPluginReport()
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
initialize() {
|
|
80
|
+
runBaseMigrations(this.db);
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Create a scoped instance for a specific plugin.
|
|
84
|
+
* Also bakes in live + fetchMode so getOrFetch() can enforce the protocol
|
|
85
|
+
* without exposing those controls to the plugin author.
|
|
86
|
+
*/
|
|
87
|
+
scope(pluginName, snapshotId, options = {}) {
|
|
88
|
+
if (this._pluginName && this._pluginName !== pluginName) {
|
|
89
|
+
throw new Error(`Security Violation: Cannot re-scope a database instance already bound to "${this._pluginName}"`);
|
|
90
|
+
}
|
|
91
|
+
const scoped = Object.create(this);
|
|
92
|
+
scoped._pluginName = pluginName;
|
|
93
|
+
scoped._snapshotId = snapshotId;
|
|
94
|
+
scoped._live = options.live ?? false;
|
|
95
|
+
scoped._fetchMode = options.fetchMode ?? 'network';
|
|
96
|
+
return scoped;
|
|
97
|
+
}
|
|
98
|
+
registerPluginDataSchema(pluginNameOrColumns, extraColumns) {
|
|
99
|
+
let pluginName = this._pluginName;
|
|
100
|
+
let columns = pluginNameOrColumns;
|
|
101
|
+
if (typeof pluginNameOrColumns === 'string') {
|
|
102
|
+
pluginName = pluginNameOrColumns;
|
|
103
|
+
columns = extraColumns;
|
|
104
|
+
}
|
|
105
|
+
if (!pluginName)
|
|
106
|
+
throw new Error('Plugin name is required for registration (use unbound DB or scope() before calling)');
|
|
107
|
+
if (!columns)
|
|
108
|
+
throw new Error('Columns definition is required');
|
|
109
|
+
// Sanitize: hyphens are invalid in unquoted SQLite identifiers (e.g. "heading-health" → "heading_health_plugin")
|
|
110
|
+
const tableName = this._toTableName(pluginName);
|
|
111
|
+
// Validate columns
|
|
112
|
+
const reserved = ['id', 'snapshot_id', 'url_id', 'score', 'weight', 'created_at'];
|
|
113
|
+
for (const col of Object.keys(columns)) {
|
|
114
|
+
if (reserved.includes(col.toLowerCase())) {
|
|
115
|
+
throw new Error(`Plugin "${pluginName}" cannot define reserved column "${col}". Reserved: ${reserved.join(', ')}`);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
if (this._isMigrationExecuted(pluginName)) {
|
|
119
|
+
// Even if executed, ensure the registry knows about the table name for this session
|
|
120
|
+
this.registry.registerTable(tableName, pluginName);
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
123
|
+
const columnDefs = [
|
|
124
|
+
'id INTEGER PRIMARY KEY AUTOINCREMENT',
|
|
125
|
+
'snapshot_id INTEGER NOT NULL',
|
|
126
|
+
'url_id INTEGER NOT NULL',
|
|
127
|
+
...Object.entries(columns).map(([col, type]) => `${col} ${type}`),
|
|
128
|
+
"score REAL",
|
|
129
|
+
"weight REAL DEFAULT 1.0",
|
|
130
|
+
"created_at TEXT DEFAULT (datetime('now'))",
|
|
131
|
+
'FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE',
|
|
132
|
+
'FOREIGN KEY(url_id) REFERENCES pages(id) ON DELETE CASCADE'
|
|
133
|
+
];
|
|
134
|
+
const migrationSQL = `
|
|
135
|
+
CREATE TABLE IF NOT EXISTS ${tableName} (
|
|
136
|
+
${columnDefs.join(',\n ')}
|
|
137
|
+
);
|
|
138
|
+
CREATE INDEX IF NOT EXISTS idx_${tableName}_snapshot_url ON ${tableName}(snapshot_id, url_id);
|
|
139
|
+
`;
|
|
140
|
+
this.runInTransaction(() => {
|
|
141
|
+
this.registry.registerTable(tableName, pluginName);
|
|
142
|
+
this.db.exec(migrationSQL);
|
|
143
|
+
this.statements.insertMigration.run(pluginName);
|
|
144
|
+
this.registry.registerPlugin(pluginName);
|
|
145
|
+
});
|
|
146
|
+
}
|
|
147
|
+
/** @deprecated Use registerPluginDataSchema */
|
|
148
|
+
registerPluginMigration(pluginName, migrationSQL) {
|
|
149
|
+
this.runInTransaction(() => {
|
|
150
|
+
const tableMatches = migrationSQL.matchAll(/CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)/gi);
|
|
151
|
+
for (const match of tableMatches) {
|
|
152
|
+
this.registry.registerTable(match[1], pluginName);
|
|
153
|
+
}
|
|
154
|
+
this.db.exec(migrationSQL);
|
|
155
|
+
this.statements.insertMigration.run(pluginName);
|
|
156
|
+
this.registry.registerPlugin(pluginName);
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
getPageIdByUrl(snapshotId, url) {
|
|
160
|
+
const raw = url.trim();
|
|
161
|
+
if (!raw)
|
|
162
|
+
return null;
|
|
163
|
+
// Support both stored path format ("/foo") and absolute URL inputs.
|
|
164
|
+
const candidates = new Set();
|
|
165
|
+
const addCandidateWithSlashVariants = (value) => {
|
|
166
|
+
if (!value)
|
|
167
|
+
return;
|
|
168
|
+
candidates.add(value);
|
|
169
|
+
if (value === '/')
|
|
170
|
+
return;
|
|
171
|
+
if (value.endsWith('/')) {
|
|
172
|
+
candidates.add(value.slice(0, -1));
|
|
173
|
+
}
|
|
174
|
+
else {
|
|
175
|
+
candidates.add(`${value}/`);
|
|
176
|
+
}
|
|
177
|
+
};
|
|
178
|
+
if (raw.startsWith('/')) {
|
|
179
|
+
addCandidateWithSlashVariants(raw);
|
|
180
|
+
}
|
|
181
|
+
const normalizedPath = normalizeUrl(raw, '', { stripQuery: false, toPath: true });
|
|
182
|
+
if (normalizedPath) {
|
|
183
|
+
addCandidateWithSlashVariants(normalizedPath);
|
|
184
|
+
}
|
|
185
|
+
// Fallback for absolute URL parsing; normalizeUrl may return null for malformed/bare path input.
|
|
186
|
+
try {
|
|
187
|
+
const parsed = new URL(raw);
|
|
188
|
+
addCandidateWithSlashVariants(`${parsed.pathname}${parsed.search}`);
|
|
189
|
+
}
|
|
190
|
+
catch {
|
|
191
|
+
// ignore
|
|
192
|
+
}
|
|
193
|
+
for (const candidate of candidates) {
|
|
194
|
+
const row = this.statements.getPageIdByUrl.get(snapshotId, candidate);
|
|
195
|
+
if (row)
|
|
196
|
+
return row.id;
|
|
197
|
+
}
|
|
198
|
+
return null;
|
|
199
|
+
}
|
|
200
|
+
insertPluginReport(input) {
|
|
201
|
+
const snapshotId = input.snapshotId || this._snapshotId;
|
|
202
|
+
const pluginName = input.pluginName || this._pluginName;
|
|
203
|
+
if (!snapshotId)
|
|
204
|
+
throw new Error('snapshotId is required (not found in input or scope)');
|
|
205
|
+
if (!pluginName)
|
|
206
|
+
throw new Error('pluginName is required (not found in input or scope)');
|
|
207
|
+
this._assertSnapshotExists(snapshotId);
|
|
208
|
+
const data = JSON.stringify(input.summary);
|
|
209
|
+
this.statements.insertPluginReport.run(snapshotId, pluginName, data, input.totalScore ?? null, input.scoreCount ?? null, input.scoreWeightSum ?? null, input.scoreCalculatedAt ?? null);
|
|
210
|
+
}
|
|
211
|
+
insertPluginRow(input) {
|
|
212
|
+
const tableName = input.tableName || this._pluginName;
|
|
213
|
+
const snapshotId = input.snapshotId || this._snapshotId;
|
|
214
|
+
if (!tableName)
|
|
215
|
+
throw new Error('tableName/pluginName is required');
|
|
216
|
+
if (!snapshotId)
|
|
217
|
+
throw new Error('snapshotId is required');
|
|
218
|
+
const resolvedTable = this._resolveTableName(tableName);
|
|
219
|
+
this._assertSnapshotExists(snapshotId);
|
|
220
|
+
this._assertOwnership(resolvedTable);
|
|
221
|
+
this._assertTableRegistered(resolvedTable);
|
|
222
|
+
const urlId = this.getPageIdByUrl(snapshotId, input.url);
|
|
223
|
+
if (!urlId) {
|
|
224
|
+
throw new Error(`URL "${input.url}" not found in snapshot ${snapshotId}`);
|
|
225
|
+
}
|
|
226
|
+
const columns = Object.keys(input.data);
|
|
227
|
+
const placeholders = columns.map(() => '?').join(', ');
|
|
228
|
+
const fields = ['snapshot_id', 'url_id', ...columns].join(', ');
|
|
229
|
+
// We must use a dynamic but safe query here because T is unknown.
|
|
230
|
+
// However, since we validated tableName against the registry, it's safe.
|
|
231
|
+
// We still use parameters for all values.
|
|
232
|
+
const stmt = this.db.prepare(`
|
|
233
|
+
INSERT INTO ${resolvedTable} (${fields})
|
|
234
|
+
VALUES (?, ?, ${placeholders})
|
|
235
|
+
`);
|
|
236
|
+
const values = Object.values(input.data).map(v => typeof v === 'object' && v !== null ? JSON.stringify(v) : v);
|
|
237
|
+
stmt.run(snapshotId, urlId, ...values);
|
|
238
|
+
}
|
|
239
|
+
getPluginReport(snapshotId, pluginName) {
|
|
240
|
+
const sid = snapshotId || this._snapshotId;
|
|
241
|
+
const name = pluginName || this._pluginName;
|
|
242
|
+
if (!sid || !name)
|
|
243
|
+
throw new Error('snapshotId and pluginName are required');
|
|
244
|
+
const row = this.statements.getPluginReport.get(sid, name);
|
|
245
|
+
return row ? JSON.parse(row.data) : null;
|
|
246
|
+
}
|
|
247
|
+
getPluginRows(tableName, snapshotId) {
|
|
248
|
+
const targetTable = tableName || (this._pluginName ? this._toTableName(this._pluginName) : undefined);
|
|
249
|
+
const sid = snapshotId || this._snapshotId;
|
|
250
|
+
if (!targetTable || !sid)
|
|
251
|
+
throw new Error('Table name and snapshotId are required');
|
|
252
|
+
const resolvedTable = this._resolveTableName(targetTable);
|
|
253
|
+
this._assertTableRegistered(resolvedTable);
|
|
254
|
+
this._assertOwnership(resolvedTable);
|
|
255
|
+
const rows = this.db.prepare(`SELECT * FROM ${resolvedTable} WHERE snapshot_id = ?`).all(sid);
|
|
256
|
+
return rows.map(row => this._parseRow(row));
|
|
257
|
+
}
|
|
258
|
+
getPluginRow(tableNameOrUrl, snapshotId, url, options = {}) {
|
|
259
|
+
let targetTable = this._pluginName;
|
|
260
|
+
let sid = snapshotId || this._snapshotId;
|
|
261
|
+
let targetUrl = tableNameOrUrl;
|
|
262
|
+
// If called with 3 args or first arg is a registered table/plugin name
|
|
263
|
+
// Scoped instances MUST NOT allow overriding the target table to another plugin's table.
|
|
264
|
+
if (!this._pluginName && (url || this.registry.isTableRegistered(tableNameOrUrl) || this.registry.isTableRegistered(`${tableNameOrUrl}_plugin`))) {
|
|
265
|
+
targetTable = tableNameOrUrl;
|
|
266
|
+
sid = snapshotId || this._snapshotId;
|
|
267
|
+
targetUrl = url;
|
|
268
|
+
}
|
|
269
|
+
if (!targetTable || (!sid && !options.global) || !targetUrl) {
|
|
270
|
+
throw new Error(`Missing required arguments for getPluginRow: table=${targetTable}, snapshot=${sid}, url=${targetUrl}`);
|
|
271
|
+
}
|
|
272
|
+
const resolvedTable = this._resolveTableName(targetTable);
|
|
273
|
+
this._assertTableRegistered(resolvedTable);
|
|
274
|
+
this._assertOwnership(resolvedTable);
|
|
275
|
+
// We use normalized URL to get the ID, but for 'global' lookup we might need to be more careful.
|
|
276
|
+
// For now, we assume url_id maps to 'pages' which is snapshotted.
|
|
277
|
+
// Actually, if it's 'global', we should search by actual normalized URL across snapshots.
|
|
278
|
+
// Let's refine the query:
|
|
279
|
+
let query = `SELECT t.* FROM ${resolvedTable} t`;
|
|
280
|
+
const params = [];
|
|
281
|
+
if (options.global) {
|
|
282
|
+
// Join with pages to find the URL globally across all snapshots
|
|
283
|
+
query += ` JOIN pages p ON t.url_id = p.id WHERE p.normalized_url = ?`;
|
|
284
|
+
params.push(targetUrl);
|
|
285
|
+
}
|
|
286
|
+
else {
|
|
287
|
+
const urlId = this.getPageIdByUrl(sid, targetUrl);
|
|
288
|
+
if (!urlId)
|
|
289
|
+
return null;
|
|
290
|
+
query += ` WHERE t.snapshot_id = ? AND t.url_id = ?`;
|
|
291
|
+
params.push(sid, urlId);
|
|
292
|
+
}
|
|
293
|
+
if (options.maxAge) {
|
|
294
|
+
const seconds = typeof options.maxAge === 'number' ? options.maxAge : this._parseDuration(options.maxAge);
|
|
295
|
+
query += ` AND t.created_at >= datetime('now', '-${seconds} seconds')`;
|
|
296
|
+
}
|
|
297
|
+
query += ` ORDER BY t.id DESC LIMIT 1`;
|
|
298
|
+
const row = this.db.prepare(query).get(...params);
|
|
299
|
+
return row ? this._parseRow(row) : null;
|
|
300
|
+
}
|
|
301
|
+
_parseDuration(duration) {
|
|
302
|
+
const match = duration.match(/^(\d+)([hmds])$/);
|
|
303
|
+
if (!match)
|
|
304
|
+
throw new Error(`Invalid duration format: ${duration}. Use e.g. "24h", "1h", "600s"`);
|
|
305
|
+
const value = parseInt(match[1]);
|
|
306
|
+
const unit = match[2];
|
|
307
|
+
const multipliers = {
|
|
308
|
+
's': 1,
|
|
309
|
+
'm': 60,
|
|
310
|
+
'h': 3600,
|
|
311
|
+
'd': 86400
|
|
312
|
+
};
|
|
313
|
+
return value * multipliers[unit];
|
|
314
|
+
}
|
|
315
|
+
_parseRow(row) {
|
|
316
|
+
const result = { ...row };
|
|
317
|
+
for (const key in result) {
|
|
318
|
+
if (typeof result[key] === 'string' && (result[key].startsWith('{') || result[key].startsWith('['))) {
|
|
319
|
+
try {
|
|
320
|
+
result[key] = JSON.parse(result[key]);
|
|
321
|
+
}
|
|
322
|
+
catch {
|
|
323
|
+
// Not JSON or parse failed, keep as string
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
return result;
|
|
328
|
+
}
|
|
329
|
+
deleteSnapshotPlugins(snapshotId) {
|
|
330
|
+
this.runInTransaction(() => {
|
|
331
|
+
this.statements.deleteSnapshotPlugins.run(snapshotId);
|
|
332
|
+
// Also cleanup registered plugin tables
|
|
333
|
+
// We don't have a list of all rows in all tables, but we know the table names
|
|
334
|
+
// Registered in the registry.
|
|
335
|
+
// This implementation assumes plugins follow the convention of having a snapshot_id column.
|
|
336
|
+
});
|
|
337
|
+
}
|
|
338
|
+
async _getOrFetch(url, fetchFn) {
|
|
339
|
+
// 1. Check cache (global across snapshots)
|
|
340
|
+
const cached = this.getPluginRow(url, undefined, undefined, { global: true });
|
|
341
|
+
if (cached !== null) {
|
|
342
|
+
// Materialize a snapshot-local row even when cache hits globally,
|
|
343
|
+
// so per-snapshot score aggregation remains consistent.
|
|
344
|
+
if (this._pluginName && this._snapshotId) {
|
|
345
|
+
const existingForSnapshot = this.getPluginRow(url);
|
|
346
|
+
if (existingForSnapshot === null) {
|
|
347
|
+
const cachedRow = cached;
|
|
348
|
+
const { id: _id, snapshot_id: _snapshotId, url_id: _urlId, created_at: _createdAt, ...pluginData } = cachedRow;
|
|
349
|
+
this.insertPluginRow({ url, data: pluginData });
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
return cached; // Always use cache when it exists
|
|
353
|
+
}
|
|
354
|
+
// 2. No cache. Can we fetch live?
|
|
355
|
+
const canFetchFresh = this._fetchMode === 'local' || (this._fetchMode === 'network' && this._live);
|
|
356
|
+
if (!canFetchFresh) {
|
|
357
|
+
return null; // Silent skip, no data to return
|
|
358
|
+
}
|
|
359
|
+
// 3. Compute/Fetch fresh data
|
|
360
|
+
const freshData = await fetchFn();
|
|
361
|
+
// 4. Save to DB
|
|
362
|
+
this.insertPluginRow({ url, data: freshData });
|
|
363
|
+
return freshData;
|
|
364
|
+
}
|
|
365
|
+
aggregateScoreProviders(snapshotId, plugins) {
|
|
366
|
+
this._assertSnapshotExists(snapshotId);
|
|
367
|
+
for (const plugin of plugins) {
|
|
368
|
+
if (!plugin.scoreProvider || !plugin.storage?.perPage?.columns)
|
|
369
|
+
continue;
|
|
370
|
+
const tableName = this._toTableName(plugin.name);
|
|
371
|
+
if (!this.registry.isTableRegistered(tableName))
|
|
372
|
+
continue;
|
|
373
|
+
try {
|
|
374
|
+
// Ensure the table schema strictly conforms by ignoring plugins that might have bypassed registry constraints
|
|
375
|
+
const aggregate = this.db.prepare(`
|
|
376
|
+
SELECT
|
|
377
|
+
SUM(score * weight) as total_score,
|
|
378
|
+
SUM(weight) as score_weight_sum,
|
|
379
|
+
COUNT(score) as score_count
|
|
380
|
+
FROM ${tableName}
|
|
381
|
+
WHERE snapshot_id = ? AND score IS NOT NULL
|
|
382
|
+
`).get(snapshotId);
|
|
383
|
+
if (!aggregate || aggregate.score_count === 0)
|
|
384
|
+
continue;
|
|
385
|
+
const { total_score, score_weight_sum, score_count } = aggregate;
|
|
386
|
+
const recentReport = this.db.prepare(`
|
|
387
|
+
SELECT id FROM plugin_reports
|
|
388
|
+
WHERE snapshot_id = ? AND plugin_name = ?
|
|
389
|
+
ORDER BY created_at DESC LIMIT 1
|
|
390
|
+
`).get(snapshotId, plugin.name);
|
|
391
|
+
if (recentReport) {
|
|
392
|
+
this.db.prepare(`
|
|
393
|
+
UPDATE plugin_reports
|
|
394
|
+
SET total_score = ?, score_weight_sum = ?, score_count = ?, score_calculated_at = datetime('now')
|
|
395
|
+
WHERE id = ?
|
|
396
|
+
`).run(total_score, score_weight_sum, score_count, recentReport.id);
|
|
397
|
+
}
|
|
398
|
+
else {
|
|
399
|
+
this.db.prepare(`
|
|
400
|
+
INSERT INTO plugin_reports
|
|
401
|
+
(snapshot_id, plugin_name, data, total_score, score_weight_sum, score_count, score_calculated_at)
|
|
402
|
+
VALUES (?, ?, '{}', ?, ?, ?, datetime('now'))
|
|
403
|
+
`).run(snapshotId, plugin.name, total_score, score_weight_sum, score_count);
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
catch (err) {
|
|
407
|
+
console.error(`[CrawlithDB.Aggregation] Failed to aggregate scores for plugin: ${plugin.name} - ${err.message}`);
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
try {
|
|
411
|
+
// After all plugins are aggregated, tally up the snapshot-level totals
|
|
412
|
+
const snapshotAggregate = this.db.prepare(`
|
|
413
|
+
SELECT
|
|
414
|
+
SUM(total_score) as overall_total_score,
|
|
415
|
+
SUM(score_weight_sum) as overall_weight_sum,
|
|
416
|
+
SUM(score_count) as overall_score_count
|
|
417
|
+
FROM plugin_reports
|
|
418
|
+
WHERE snapshot_id = ? AND total_score IS NOT NULL
|
|
419
|
+
`).get(snapshotId);
|
|
420
|
+
if (snapshotAggregate && snapshotAggregate.overall_score_count > 0) {
|
|
421
|
+
const pluginTotal = snapshotAggregate.overall_total_score || 0;
|
|
422
|
+
const pluginWeight = snapshotAggregate.overall_weight_sum || 0;
|
|
423
|
+
const pluginCount = snapshotAggregate.overall_score_count || 0;
|
|
424
|
+
// 1. Update the plugin-specific aggregate columns in the snapshots table
|
|
425
|
+
this.db.prepare(`
|
|
426
|
+
UPDATE snapshots
|
|
427
|
+
SET
|
|
428
|
+
total_score = ?,
|
|
429
|
+
score_weight_sum = ?,
|
|
430
|
+
score_count = ?,
|
|
431
|
+
score_calculated_at = datetime('now')
|
|
432
|
+
WHERE id = ?
|
|
433
|
+
`).run(pluginTotal, pluginWeight, pluginCount, snapshotId);
|
|
434
|
+
// 2. Blend with the Core Health Score
|
|
435
|
+
// We fetch the current health_score (calculated by HealthService) and treat it
|
|
436
|
+
// as a "Core" provider with a standard weight of 100.
|
|
437
|
+
const snapshot = this.db.prepare('SELECT health_score FROM snapshots WHERE id = ?').get(snapshotId);
|
|
438
|
+
const coreScore = snapshot?.health_score;
|
|
439
|
+
const CORE_WEIGHT = 100;
|
|
440
|
+
let blendedScore = null;
|
|
441
|
+
if (coreScore !== null && coreScore !== undefined) {
|
|
442
|
+
// Weighted Average: (CoreScore * 100 + Sum(PluginScore * Weight)) / (100 + Sum(Weights))
|
|
443
|
+
blendedScore = (coreScore * CORE_WEIGHT + pluginTotal) / (CORE_WEIGHT + pluginWeight);
|
|
444
|
+
}
|
|
445
|
+
else {
|
|
446
|
+
// Fallback to pure plugin average if core health isn't computed
|
|
447
|
+
blendedScore = pluginTotal / pluginWeight;
|
|
448
|
+
}
|
|
449
|
+
if (blendedScore !== null) {
|
|
450
|
+
this.db.prepare('UPDATE snapshots SET health_score = ? WHERE id = ?').run(Number(blendedScore.toFixed(1)), snapshotId);
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
catch (err) {
|
|
455
|
+
console.error(`[CrawlithDB.Aggregation] Failed to aggregate snapshot-level scores: ${err.message}`);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
runInTransaction(fn) {
|
|
459
|
+
const tx = this.db.transaction(fn);
|
|
460
|
+
tx();
|
|
461
|
+
}
|
|
462
|
+
_resolveTableName(name) {
|
|
463
|
+
if (this.registry.isTableRegistered(name))
|
|
464
|
+
return name;
|
|
465
|
+
const pluginTable = this._toTableName(name);
|
|
466
|
+
if (this.registry.isTableRegistered(pluginTable))
|
|
467
|
+
return pluginTable;
|
|
468
|
+
return name; // Will fail assertion later
|
|
469
|
+
}
|
|
470
|
+
/** Converts a plugin name to its canonical SQLite table name, sanitizing invalid characters. */
|
|
471
|
+
_toTableName(pluginName) {
|
|
472
|
+
return `${pluginName.replace(/-/g, '_')}_plugin`;
|
|
473
|
+
}
|
|
474
|
+
close() {
|
|
475
|
+
this.db.close();
|
|
476
|
+
}
|
|
477
|
+
_isMigrationExecuted(pluginName) {
|
|
478
|
+
const row = this.statements.getMigration.get(pluginName);
|
|
479
|
+
return !!row;
|
|
480
|
+
}
|
|
481
|
+
_assertSnapshotExists(snapshotId) {
|
|
482
|
+
const row = this.statements.getSnapshot.get(snapshotId);
|
|
483
|
+
if (!row) {
|
|
484
|
+
throw new Error(`Snapshot ID ${snapshotId} does not exist`);
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
_assertTableRegistered(tableName) {
|
|
488
|
+
if (!this.registry.isTableRegistered(tableName)) {
|
|
489
|
+
throw new Error(`Access Denied: Table "${tableName}" is not registered by any plugin migration.`);
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
_assertOwnership(tableName) {
|
|
493
|
+
if (!this._pluginName)
|
|
494
|
+
return; // Unbound instance has full access
|
|
495
|
+
const owner = this.registry.getPluginForTable(tableName);
|
|
496
|
+
if (owner !== this._pluginName) {
|
|
497
|
+
throw new Error(`Security Violation: Plugin "${this._pluginName}" attempted to access table "${tableName}" owned by "${owner}".`);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
}
|
package/dist/db/graphLoader.js
CHANGED
|
@@ -10,22 +10,38 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
10
10
|
const edgeRepo = new EdgeRepository(db);
|
|
11
11
|
const metricsRepo = new MetricsRepository(db);
|
|
12
12
|
const snapshotRepo = new SnapshotRepository(db);
|
|
13
|
-
const pages = pageRepo.getPagesBySnapshot(snapshotId);
|
|
14
|
-
const metrics = metricsRepo.getMetrics(snapshotId);
|
|
15
13
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
14
|
+
const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId, snapshot?.run_type || 'completed');
|
|
15
|
+
const metrics = metricsRepo.getMetricsIterator(snapshotId);
|
|
16
16
|
const metricsMap = new Map();
|
|
17
17
|
for (const m of metrics) {
|
|
18
18
|
metricsMap.set(m.page_id, m);
|
|
19
19
|
}
|
|
20
20
|
const graph = new Graph();
|
|
21
|
+
let pagesFetched = 0;
|
|
22
|
+
let pagesCached = 0;
|
|
23
|
+
let pagesSkipped = 0;
|
|
21
24
|
if (snapshot) {
|
|
22
25
|
graph.limitReached = !!snapshot.limit_reached;
|
|
23
26
|
}
|
|
24
27
|
const idMap = new Map();
|
|
25
28
|
for (const p of pages) {
|
|
26
29
|
idMap.set(p.id, p.normalized_url);
|
|
27
|
-
graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
|
|
30
|
+
graph.addNode(p.normalized_url, p.depth, p.http_status || 0, !!p.is_internal);
|
|
28
31
|
const m = metricsMap.get(p.id);
|
|
32
|
+
if (m) {
|
|
33
|
+
const isProcessed = m.crawl_status === 'fetched' ||
|
|
34
|
+
m.crawl_status === 'fetched_error' ||
|
|
35
|
+
m.crawl_status === 'network_error' ||
|
|
36
|
+
m.crawl_status === 'failed_after_retries' ||
|
|
37
|
+
m.crawl_status === 'blocked_by_robots';
|
|
38
|
+
if (isProcessed)
|
|
39
|
+
pagesFetched++;
|
|
40
|
+
else if (m.crawl_status === 'cached')
|
|
41
|
+
pagesCached++;
|
|
42
|
+
else if (m.crawl_status === 'skipped')
|
|
43
|
+
pagesSkipped++;
|
|
44
|
+
}
|
|
29
45
|
let incrementalStatus;
|
|
30
46
|
if (p.first_seen_snapshot_id === snapshotId) {
|
|
31
47
|
incrementalStatus = 'new';
|
|
@@ -37,13 +53,14 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
37
53
|
incrementalStatus = 'changed';
|
|
38
54
|
}
|
|
39
55
|
graph.updateNodeData(p.normalized_url, {
|
|
56
|
+
isInternal: !!p.is_internal,
|
|
40
57
|
canonical: p.canonical_url || undefined,
|
|
58
|
+
discoveredViaSitemap: !!p.discovered_via_sitemap,
|
|
41
59
|
contentHash: p.content_hash || undefined,
|
|
42
60
|
simhash: p.simhash || undefined,
|
|
43
61
|
etag: p.etag || undefined,
|
|
44
62
|
lastModified: p.last_modified || undefined,
|
|
45
63
|
html: p.html || undefined,
|
|
46
|
-
soft404Score: p.soft404_score || undefined,
|
|
47
64
|
noindex: !!p.noindex,
|
|
48
65
|
nofollow: !!p.nofollow,
|
|
49
66
|
incrementalStatus,
|
|
@@ -55,18 +72,24 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
55
72
|
crawlTrapRisk: p.crawl_trap_risk || undefined,
|
|
56
73
|
trapType: p.trap_type || undefined,
|
|
57
74
|
// Metrics
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
75
|
+
// Additional metrics
|
|
76
|
+
crawlStatus: m?.crawl_status || undefined,
|
|
77
|
+
wordCount: m?.word_count != null ? m.word_count : undefined,
|
|
78
|
+
thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
|
|
79
|
+
externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
|
|
80
|
+
pagerankScore: m?.pagerank_score ?? undefined,
|
|
61
81
|
hubScore: m?.hub_score ?? undefined,
|
|
82
|
+
authScore: m?.auth_score ?? undefined,
|
|
62
83
|
linkRole: m?.link_role ?? undefined,
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
84
|
+
soft404Score: m?.soft404_score ?? undefined,
|
|
85
|
+
headingScore: m?.heading_score ?? undefined,
|
|
86
|
+
orphanScore: m?.orphan_score ?? undefined,
|
|
87
|
+
orphanType: m?.orphan_type ?? undefined,
|
|
88
|
+
impactLevel: m?.impact_level ?? undefined,
|
|
89
|
+
headingData: m?.heading_data ?? undefined,
|
|
67
90
|
});
|
|
68
91
|
}
|
|
69
|
-
const edges = edgeRepo.
|
|
92
|
+
const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
|
|
70
93
|
for (const e of edges) {
|
|
71
94
|
const source = idMap.get(e.source_page_id);
|
|
72
95
|
const target = idMap.get(e.target_page_id);
|
|
@@ -74,23 +97,12 @@ export function loadGraphFromSnapshot(snapshotId) {
|
|
|
74
97
|
graph.addEdge(source, target, e.weight || 1.0);
|
|
75
98
|
}
|
|
76
99
|
}
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
severity: c.severity
|
|
85
|
-
}));
|
|
86
|
-
// Load content clusters
|
|
87
|
-
const contentClusters = db.prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?').all(snapshotId);
|
|
88
|
-
graph.contentClusters = contentClusters.map(c => ({
|
|
89
|
-
id: c.id,
|
|
90
|
-
count: c.count,
|
|
91
|
-
primaryUrl: c.primary_url,
|
|
92
|
-
risk: c.risk,
|
|
93
|
-
sharedPathPrefix: c.shared_path_prefix || undefined
|
|
94
|
-
}));
|
|
100
|
+
// Set session stats
|
|
101
|
+
graph.sessionStats = {
|
|
102
|
+
pagesFetched,
|
|
103
|
+
pagesCached,
|
|
104
|
+
pagesSkipped,
|
|
105
|
+
totalFound: idMap.size
|
|
106
|
+
};
|
|
95
107
|
return graph;
|
|
96
108
|
}
|
package/dist/db/index.d.ts
CHANGED
|
@@ -1,4 +1,15 @@
|
|
|
1
1
|
import Database from 'better-sqlite3';
|
|
2
|
+
import { CrawlithDB } from './CrawlithDB.js';
|
|
3
|
+
export * from './repositories/SiteRepository.js';
|
|
4
|
+
export * from './repositories/SnapshotRepository.js';
|
|
5
|
+
export * from './CrawlithDB.js';
|
|
2
6
|
export declare function getDbPath(): string;
|
|
7
|
+
/**
|
|
8
|
+
* Returns the higher-level CrawlithDB wrapper for plugins and new code.
|
|
9
|
+
*/
|
|
10
|
+
export declare function getCrawlithDB(): CrawlithDB;
|
|
11
|
+
/**
|
|
12
|
+
* Returns the raw better-sqlite3 Database instance for legacy repositories.
|
|
13
|
+
*/
|
|
3
14
|
export declare function getDb(): Database.Database;
|
|
4
15
|
export declare function closeDb(): void;
|