@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -0,0 +1,500 @@
1
+ import Database from 'better-sqlite3';
2
+ import { runBaseMigrations } from './migrations.js';
3
+ import { Statements } from './statements.js';
4
+ import { PluginRegistry } from './pluginRegistry.js';
5
+ import { normalizeUrl } from '../crawler/normalize.js';
6
/**
 * SQLite-backed storage facade used by the core and handed (in scoped form) to plugins.
 *
 * An instance is either *unbound* (no `_pluginName`; ownership checks are skipped) or
 * *scoped* via {@link CrawlithDB#scope}, in which case table access is restricted to
 * tables the in-memory registry attributes to that plugin.
 */
export class CrawlithDB {
    /**
     * Names that may be interpolated into SQL without quoting. Mirrors SQLite's rule for
     * bare identifiers: first character is a letter, underscore or non-ASCII character;
     * the rest may additionally contain digits and `$`.
     */
    static _IDENTIFIER_PATTERN = /^[A-Za-z_\u0080-\uFFFF][A-Za-z0-9_$\u0080-\uFFFF]*$/;
    db;
    statements;
    registry;
    /**
     * @internal
     * Dangerous: Returns the raw better-sqlite3 instance.
     * Core only. Plugins must never use this.
     */
    unsafeGetRawDb() {
        return this.db;
    }
    // Optional scoping properties
    _pluginName;
    _snapshotId;
    /** Whether live fallback is allowed (from --live flag). Core-controlled. */
    _live = false;
    /** Whether this plugin makes network calls. Core-controlled via plugin.storage.fetchMode. */
    _fetchMode = 'network';
    /**
     * Opens the database, applies connection pragmas, runs an integrity check
     * (a failure is only logged, not thrown), runs base migrations and prepares statements.
     * @param {string} dbPath - Path passed straight to the better-sqlite3 constructor.
     */
    constructor(dbPath) {
        this.db = new Database(dbPath);
        this.db.pragma('journal_mode = WAL');
        this.db.pragma('synchronous = NORMAL');
        this.db.pragma('foreign_keys = ON');
        this.db.pragma('temp_store = MEMORY');
        this.db.pragma('mmap_size = 30000000000');
        this.db.pragma('cache_size = -20000');
        this.db.pragma('busy_timeout = 5000');
        // Integrity check on startup
        const integrity = this.db.pragma('integrity_check', { simple: true });
        if (integrity !== 'ok') {
            console.warn('Database integrity check failed:', integrity);
        }
        this.registry = new PluginRegistry();
        // Migrations must run before Statements is constructed (it is created right after).
        this.initialize();
        this.statements = new Statements(this.db);
    }
    /**
     * Schema API
     */
    get schema() {
        return {
            define: (columns) => this.registerPluginDataSchema(columns)
        };
    }
    /**
     * Fluent Data API (URL-scoped rows)
     */
    get data() {
        return {
            save: (input) => this.insertPluginRow(input),
            find: (url, options) => this.getPluginRow(url, undefined, undefined, options),
            all: () => this.getPluginRows(),
            /**
             * Cache-first with live fallback. Core-enforced pattern:
             * 1. If cached data exists → return it (always, regardless of age)
             * 2. If no cache + fetchMode='network' + live=false → return null (skip)
             * 3. If no cache + (fetchMode='local' OR live=true) → call fetchFn, save, return
             *
             * Plugin authors NEVER touch ctx.live — the core injects it via scope().
             */
            getOrFetch: (url, fetchFn) => this._getOrFetch(url, fetchFn),
        };
    }
    /**
     * Report API (Global snapshot summary)
     */
    get report() {
        return {
            save: (summary, optionalScores) => this.insertPluginReport({ summary, ...optionalScores }),
            find: () => this.getPluginReport()
        };
    }
    /** Runs the base (core) migrations against the open database. */
    initialize() {
        runBaseMigrations(this.db);
    }
    /**
     * Create a scoped instance for a specific plugin.
     * Also bakes in live + fetchMode so getOrFetch() can enforce the protocol
     * without exposing those controls to the plugin author.
     *
     * The scoped object is created with `Object.create(this)`, so it shares `db`,
     * `statements` and `registry` with the parent through the prototype chain.
     *
     * @param {string} pluginName
     * @param {number} snapshotId
     * @param {{ live?: boolean, fetchMode?: string }} [options]
     * @returns {CrawlithDB} A scoped view of this instance.
     * @throws {Error} When this instance is already bound to a different plugin.
     */
    scope(pluginName, snapshotId, options = {}) {
        if (this._pluginName && this._pluginName !== pluginName) {
            throw new Error(`Security Violation: Cannot re-scope a database instance already bound to "${this._pluginName}"`);
        }
        const scoped = Object.create(this);
        scoped._pluginName = pluginName;
        scoped._snapshotId = snapshotId;
        scoped._live = options.live ?? false;
        scoped._fetchMode = options.fetchMode ?? 'network';
        return scoped;
    }
    /**
     * Registers (and, on first use, creates) the per-page data table of a plugin.
     *
     * Call forms: `(columns)` on a scoped instance, or `(pluginName, columns)`.
     * `columns` maps column name → SQL type text. Column names are validated as plain
     * identifiers; the type text is interpolated as-is and is therefore trusted
     * plugin-author input.
     *
     * @param {string|Object<string, string>} pluginNameOrColumns
     * @param {Object<string, string>} [extraColumns]
     * @throws {Error} On missing name/columns, reserved or invalid column names, or an
     *   invalid derived table name.
     */
    registerPluginDataSchema(pluginNameOrColumns, extraColumns) {
        let pluginName = this._pluginName;
        let columns = pluginNameOrColumns;
        if (typeof pluginNameOrColumns === 'string') {
            pluginName = pluginNameOrColumns;
            columns = extraColumns;
        }
        if (!pluginName)
            throw new Error('Plugin name is required for registration (use unbound DB or scope() before calling)');
        if (!columns)
            throw new Error('Columns definition is required');
        // Sanitize: hyphens are invalid in unquoted SQLite identifiers (e.g. "heading-health" → "heading_health_plugin")
        const tableName = this._toTableName(pluginName);
        // The table name is interpolated into DDL below, so anything that is not a bare
        // identifier is rejected up front instead of producing broken or injected SQL.
        this._assertSafeIdentifier(tableName, 'table name');
        // Validate columns
        const reserved = ['id', 'snapshot_id', 'url_id', 'score', 'weight', 'created_at'];
        for (const col of Object.keys(columns)) {
            if (reserved.includes(col.toLowerCase())) {
                throw new Error(`Plugin "${pluginName}" cannot define reserved column "${col}". Reserved: ${reserved.join(', ')}`);
            }
            this._assertSafeIdentifier(col, 'column name');
        }
        if (this._isMigrationExecuted(pluginName)) {
            // Even if executed, ensure the registry knows about the table name for this session
            this.registry.registerTable(tableName, pluginName);
            return;
        }
        const columnDefs = [
            'id INTEGER PRIMARY KEY AUTOINCREMENT',
            'snapshot_id INTEGER NOT NULL',
            'url_id INTEGER NOT NULL',
            ...Object.entries(columns).map(([col, type]) => `${col} ${type}`),
            "score REAL",
            "weight REAL DEFAULT 1.0",
            "created_at TEXT DEFAULT (datetime('now'))",
            'FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE',
            'FOREIGN KEY(url_id) REFERENCES pages(id) ON DELETE CASCADE'
        ];
        const migrationSQL = `
            CREATE TABLE IF NOT EXISTS ${tableName} (
                ${columnDefs.join(',\n                ')}
            );
            CREATE INDEX IF NOT EXISTS idx_${tableName}_snapshot_url ON ${tableName}(snapshot_id, url_id);
        `;
        this.runInTransaction(() => {
            this.db.exec(migrationSQL);
            this.statements.insertMigration.run(pluginName);
            // Touch the in-memory registry only after the SQL succeeded: a thrown error
            // rolls back the transaction but could not undo a registry mutation.
            this.registry.registerTable(tableName, pluginName);
            this.registry.registerPlugin(pluginName);
        });
    }
    /**
     * Runs a raw migration script for a plugin and registers every table named in a
     * `CREATE TABLE [IF NOT EXISTS] <name>` clause of that script.
     * @deprecated Use registerPluginDataSchema
     * @param {string} pluginName
     * @param {string} migrationSQL - Executed verbatim; trusted input.
     */
    registerPluginMigration(pluginName, migrationSQL) {
        this.runInTransaction(() => {
            this.db.exec(migrationSQL);
            this.statements.insertMigration.run(pluginName);
            // Registry updates come last so a failing migration leaves no stale registrations.
            const tableMatches = migrationSQL.matchAll(/CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)/gi);
            for (const match of tableMatches) {
                this.registry.registerTable(match[1], pluginName);
            }
            this.registry.registerPlugin(pluginName);
        });
    }
    /**
     * Resolves a page id within a snapshot from either a path ("/foo") or an absolute URL.
     * Several candidate spellings (with and without a trailing slash) are tried in order.
     *
     * @param {number} snapshotId
     * @param {string} url
     * @returns {number|null} The page id, or null when nothing matches (or `url` is not a
     *   non-empty string).
     */
    getPageIdByUrl(snapshotId, url) {
        if (typeof url !== 'string')
            return null;
        const raw = url.trim();
        if (!raw)
            return null;
        // Support both stored path format ("/foo") and absolute URL inputs.
        const candidates = new Set();
        const addCandidateWithSlashVariants = (value) => {
            if (!value)
                return;
            candidates.add(value);
            if (value === '/')
                return;
            if (value.endsWith('/')) {
                candidates.add(value.slice(0, -1));
            }
            else {
                candidates.add(`${value}/`);
            }
        };
        if (raw.startsWith('/')) {
            addCandidateWithSlashVariants(raw);
        }
        const normalizedPath = normalizeUrl(raw, '', { stripQuery: false, toPath: true });
        if (normalizedPath) {
            addCandidateWithSlashVariants(normalizedPath);
        }
        // Fallback for absolute URL parsing; normalizeUrl may return null for malformed/bare path input.
        try {
            const parsed = new URL(raw);
            addCandidateWithSlashVariants(`${parsed.pathname}${parsed.search}`);
        }
        catch {
            // Not an absolute URL: the candidates gathered above are all we have.
        }
        for (const candidate of candidates) {
            const row = this.statements.getPageIdByUrl.get(snapshotId, candidate);
            if (row)
                return row.id;
        }
        return null;
    }
    /**
     * Stores a plugin report (JSON-serialized `summary` plus optional score fields).
     *
     * @param {{ summary: *, snapshotId?: number, pluginName?: string, totalScore?: number,
     *   scoreCount?: number, scoreWeightSum?: number, scoreCalculatedAt?: string }} input
     * @throws {Error} When snapshot/plugin cannot be determined, the snapshot does not exist,
     *   or a scoped instance is asked to write a report for a different plugin.
     */
    insertPluginReport(input) {
        // A scoped instance must not be able to write under another plugin's name;
        // report.save() spreads caller-provided fields into `input`, so check explicitly.
        if (this._pluginName && input.pluginName && input.pluginName !== this._pluginName) {
            throw new Error(`Security Violation: Plugin "${this._pluginName}" attempted to write a report for "${input.pluginName}".`);
        }
        const snapshotId = input.snapshotId || this._snapshotId;
        const pluginName = input.pluginName || this._pluginName;
        if (!snapshotId)
            throw new Error('snapshotId is required (not found in input or scope)');
        if (!pluginName)
            throw new Error('pluginName is required (not found in input or scope)');
        this._assertSnapshotExists(snapshotId);
        const data = JSON.stringify(input.summary);
        this.statements.insertPluginReport.run(snapshotId, pluginName, data, input.totalScore ?? null, input.scoreCount ?? null, input.scoreWeightSum ?? null, input.scoreCalculatedAt ?? null);
    }
    /**
     * Inserts one row into a plugin's per-page table.
     *
     * Object/array values are JSON-serialized; everything else is bound as-is.
     * Keys of `input.data` become column names and are validated as plain identifiers
     * because they are interpolated into the statement text.
     *
     * @param {{ url: string, data: Object, tableName?: string, snapshotId?: number }} input
     * @throws {Error} On missing table/snapshot/data, unknown snapshot, unregistered table,
     *   ownership violation, invalid column name, or a URL not present in the snapshot.
     */
    insertPluginRow(input) {
        const tableName = input.tableName || this._pluginName;
        const snapshotId = input.snapshotId || this._snapshotId;
        if (!tableName)
            throw new Error('tableName/pluginName is required');
        if (!snapshotId)
            throw new Error('snapshotId is required');
        if (input.data === null || typeof input.data !== 'object')
            throw new Error('data is required and must be an object');
        const resolvedTable = this._resolveTableName(tableName);
        this._assertSnapshotExists(snapshotId);
        this._assertOwnership(resolvedTable);
        this._assertTableRegistered(resolvedTable);
        const columns = Object.keys(input.data);
        for (const column of columns) {
            this._assertSafeIdentifier(column, 'column name');
        }
        const urlId = this.getPageIdByUrl(snapshotId, input.url);
        if (!urlId) {
            throw new Error(`URL "${input.url}" not found in snapshot ${snapshotId}`);
        }
        // Building the lists together keeps the statement valid even when `data` is empty.
        const fields = ['snapshot_id', 'url_id', ...columns].join(', ');
        const placeholders = ['?', '?', ...columns.map(() => '?')].join(', ');
        // The statement text is dynamic because the column set is plugin-defined.
        // The table name was validated against the registry and the column names against
        // the identifier pattern; all values are bound parameters.
        const stmt = this.db.prepare(`
            INSERT INTO ${resolvedTable} (${fields})
            VALUES (${placeholders})
        `);
        const values = Object.values(input.data).map(v => typeof v === 'object' && v !== null ? JSON.stringify(v) : v);
        stmt.run(snapshotId, urlId, ...values);
    }
    /**
     * Reads a plugin report and returns its parsed JSON payload.
     * @param {number} [snapshotId] - Defaults to the scoped snapshot.
     * @param {string} [pluginName] - Defaults to the scoped plugin.
     * @returns {*|null} Parsed `data` column, or null when no row exists.
     */
    getPluginReport(snapshotId, pluginName) {
        const sid = snapshotId || this._snapshotId;
        const name = pluginName || this._pluginName;
        if (!sid || !name)
            throw new Error('snapshotId and pluginName are required');
        const row = this.statements.getPluginReport.get(sid, name);
        return row ? JSON.parse(row.data) : null;
    }
    /**
     * Returns every row of a plugin table for one snapshot, with JSON-looking string
     * columns parsed (see `_parseRow`).
     * @param {string} [tableName] - Defaults to the scoped plugin's table.
     * @param {number} [snapshotId] - Defaults to the scoped snapshot.
     * @returns {Object[]}
     */
    getPluginRows(tableName, snapshotId) {
        const targetTable = tableName || (this._pluginName ? this._toTableName(this._pluginName) : undefined);
        const sid = snapshotId || this._snapshotId;
        if (!targetTable || !sid)
            throw new Error('Table name and snapshotId are required');
        const resolvedTable = this._resolveTableName(targetTable);
        this._assertTableRegistered(resolvedTable);
        this._assertOwnership(resolvedTable);
        const rows = this.db.prepare(`SELECT * FROM ${resolvedTable} WHERE snapshot_id = ?`).all(sid);
        return rows.map(row => this._parseRow(row));
    }
    /**
     * Fetches the most recent row (highest id) for a URL from a plugin table.
     *
     * Call forms: `(url, _, _, options)` on a scoped instance, or
     * `(tableName, snapshotId, url, options)` on an unbound one.
     *
     * @param {string} tableNameOrUrl
     * @param {number} [snapshotId]
     * @param {string} [url]
     * @param {{ global?: boolean, maxAge?: number|string }} [options]
     *   `global` searches all snapshots by `pages.normalized_url` (exact match on the
     *   given string); `maxAge` is seconds or a duration such as "24h" and limits rows by
     *   `created_at`. A falsy `maxAge` (including 0) applies no age filter.
     * @returns {Object|null}
     */
    getPluginRow(tableNameOrUrl, snapshotId, url, options = {}) {
        const opts = options ?? {};
        let targetTable = this._pluginName;
        let sid = snapshotId || this._snapshotId;
        let targetUrl = tableNameOrUrl;
        // If called with 3 args or first arg is a registered table/plugin name
        // Scoped instances MUST NOT allow overriding the target table to another plugin's table.
        const firstArgNamesTable = () => this.registry.isTableRegistered(tableNameOrUrl) ||
            (typeof tableNameOrUrl === 'string' && this.registry.isTableRegistered(this._toTableName(tableNameOrUrl)));
        if (!this._pluginName && (url || firstArgNamesTable())) {
            targetTable = tableNameOrUrl;
            sid = snapshotId || this._snapshotId;
            targetUrl = url;
        }
        if (!targetTable || (!sid && !opts.global) || !targetUrl) {
            throw new Error(`Missing required arguments for getPluginRow: table=${targetTable}, snapshot=${sid}, url=${targetUrl}`);
        }
        const resolvedTable = this._resolveTableName(targetTable);
        this._assertTableRegistered(resolvedTable);
        this._assertOwnership(resolvedTable);
        let query = `SELECT t.* FROM ${resolvedTable} t`;
        const params = [];
        if (opts.global) {
            // Join with pages to find the URL globally across all snapshots
            query += ` JOIN pages p ON t.url_id = p.id WHERE p.normalized_url = ?`;
            params.push(targetUrl);
        }
        else {
            const urlId = this.getPageIdByUrl(sid, targetUrl);
            if (!urlId)
                return null;
            query += ` WHERE t.snapshot_id = ? AND t.url_id = ?`;
            params.push(sid, urlId);
        }
        if (opts.maxAge) {
            const seconds = typeof opts.maxAge === 'number' ? opts.maxAge : this._parseDuration(opts.maxAge);
            if (!Number.isFinite(seconds) || seconds < 0) {
                throw new Error(`Invalid maxAge: ${opts.maxAge}. Use a non-negative number of seconds or e.g. "24h"`);
            }
            // The modifier is bound as a parameter rather than spliced into the SQL text.
            query += ` AND t.created_at >= datetime('now', ?)`;
            params.push(`-${seconds} seconds`);
        }
        query += ` ORDER BY t.id DESC LIMIT 1`;
        const row = this.db.prepare(query).get(...params);
        return row ? this._parseRow(row) : null;
    }
    /**
     * Converts a duration such as "600s", "10m", "24h" or "7d" to seconds.
     * @param {string} duration
     * @returns {number}
     * @throws {Error} When the string is not `<digits><s|m|h|d>`.
     */
    _parseDuration(duration) {
        const match = duration.match(/^(\d+)([hmds])$/);
        if (!match)
            throw new Error(`Invalid duration format: ${duration}. Use e.g. "24h", "1h", "600s"`);
        const value = Number.parseInt(match[1], 10);
        const unit = match[2];
        const multipliers = {
            's': 1,
            'm': 60,
            'h': 3600,
            'd': 86400
        };
        return value * multipliers[unit];
    }
    /**
     * Returns a shallow copy of `row` in which string values that start with `{` or `[`
     * and parse as JSON are replaced by the parsed value. Other values are left untouched.
     * @param {Object} row
     * @returns {Object}
     */
    _parseRow(row) {
        const result = { ...row };
        for (const key in result) {
            if (typeof result[key] === 'string' && (result[key].startsWith('{') || result[key].startsWith('['))) {
                try {
                    result[key] = JSON.parse(result[key]);
                }
                catch {
                    // Not JSON or parse failed, keep as string
                }
            }
        }
        return result;
    }
    /**
     * Runs the prepared `deleteSnapshotPlugins` statement for a snapshot inside a transaction.
     *
     * NOTE(review): rows in the per-plugin `*_plugin` tables are NOT removed here. Those
     * rows also act as the cache consulted by getOrFetch(), so deleting them would change
     * behavior; confirm the intended semantics before extending this.
     * @param {number} snapshotId
     */
    deleteSnapshotPlugins(snapshotId) {
        this.runInTransaction(() => {
            this.statements.deleteSnapshotPlugins.run(snapshotId);
        });
    }
    /**
     * Implementation of `data.getOrFetch` (cache-first, optional live fallback).
     *
     * @param {string} url
     * @param {() => (Object|Promise<Object>)} fetchFn - Produces fresh row data.
     * @returns {Promise<Object|null>} The cached row, the fresh data, or null when there is
     *   no cache and fetching is not permitted (or `fetchFn` produced nothing).
     */
    async _getOrFetch(url, fetchFn) {
        // 1. Check cache (global across snapshots)
        const cached = this.getPluginRow(url, undefined, undefined, { global: true });
        if (cached !== null) {
            // Materialize a snapshot-local row even when cache hits globally,
            // so per-snapshot score aggregation remains consistent.
            if (this._pluginName && this._snapshotId) {
                const existingForSnapshot = this.getPluginRow(url);
                if (existingForSnapshot === null) {
                    // Drop the bookkeeping columns; they are regenerated for the new row.
                    const { id: _id, snapshot_id: _snapshotId, url_id: _urlId, created_at: _createdAt, ...pluginData } = cached;
                    this.insertPluginRow({ url, data: pluginData });
                }
            }
            return cached; // Always use cache when it exists
        }
        // 2. No cache. Can we fetch live?
        const canFetchFresh = this._fetchMode === 'local' || (this._fetchMode === 'network' && this._live);
        if (!canFetchFresh) {
            return null; // Silent skip, no data to return
        }
        // 3. Compute/Fetch fresh data
        const freshData = await fetchFn();
        if (freshData === null || freshData === undefined) {
            // Nothing to persist; report "no data" instead of failing inside the insert.
            return null;
        }
        // 4. Save to DB
        this.insertPluginRow({ url, data: freshData });
        return freshData;
    }
    /**
     * Aggregates per-page plugin scores into `plugin_reports`, then rolls those up into
     * the `snapshots` row and blends them with the existing `health_score`.
     *
     * Failures are logged per plugin / per stage and do not abort the remaining work.
     *
     * NOTE(review): the blended value is written back to `health_score`, which is also the
     * input of the blend; calling this twice for one snapshot blends an already blended
     * score. Likewise, the snapshot roll-up sums every `plugin_reports` row of the
     * snapshot. Confirm callers invoke this once per snapshot.
     *
     * @param {number} snapshotId
     * @param {Array<{ name: string, scoreProvider?: *, storage?: { perPage?: { columns?: * } } }>} plugins
     * @throws {Error} When the snapshot does not exist.
     */
    aggregateScoreProviders(snapshotId, plugins) {
        this._assertSnapshotExists(snapshotId);
        for (const plugin of plugins) {
            if (!plugin.scoreProvider || !plugin.storage?.perPage?.columns)
                continue;
            const tableName = this._toTableName(plugin.name);
            // Only tables known to the registry are queried.
            if (!this.registry.isTableRegistered(tableName))
                continue;
            try {
                const aggregate = this.db.prepare(`
                    SELECT
                        SUM(score * weight) as total_score,
                        SUM(weight) as score_weight_sum,
                        COUNT(score) as score_count
                    FROM ${tableName}
                    WHERE snapshot_id = ? AND score IS NOT NULL
                `).get(snapshotId);
                if (!aggregate || aggregate.score_count === 0)
                    continue;
                const { total_score, score_weight_sum, score_count } = aggregate;
                const recentReport = this.db.prepare(`
                    SELECT id FROM plugin_reports
                    WHERE snapshot_id = ? AND plugin_name = ?
                    ORDER BY created_at DESC LIMIT 1
                `).get(snapshotId, plugin.name);
                if (recentReport) {
                    this.db.prepare(`
                        UPDATE plugin_reports
                        SET total_score = ?, score_weight_sum = ?, score_count = ?, score_calculated_at = datetime('now')
                        WHERE id = ?
                    `).run(total_score, score_weight_sum, score_count, recentReport.id);
                }
                else {
                    this.db.prepare(`
                        INSERT INTO plugin_reports
                        (snapshot_id, plugin_name, data, total_score, score_weight_sum, score_count, score_calculated_at)
                        VALUES (?, ?, '{}', ?, ?, ?, datetime('now'))
                    `).run(snapshotId, plugin.name, total_score, score_weight_sum, score_count);
                }
            }
            catch (err) {
                console.error(`[CrawlithDB.Aggregation] Failed to aggregate scores for plugin: ${plugin.name} - ${err.message}`);
            }
        }
        try {
            // After all plugins are aggregated, tally up the snapshot-level totals
            const snapshotAggregate = this.db.prepare(`
                SELECT
                    SUM(total_score) as overall_total_score,
                    SUM(score_weight_sum) as overall_weight_sum,
                    SUM(score_count) as overall_score_count
                FROM plugin_reports
                WHERE snapshot_id = ? AND total_score IS NOT NULL
            `).get(snapshotId);
            if (snapshotAggregate && snapshotAggregate.overall_score_count > 0) {
                const pluginTotal = snapshotAggregate.overall_total_score || 0;
                const pluginWeight = snapshotAggregate.overall_weight_sum || 0;
                const pluginCount = snapshotAggregate.overall_score_count || 0;
                // 1. Update the plugin-specific aggregate columns in the snapshots table
                this.db.prepare(`
                    UPDATE snapshots
                    SET
                        total_score = ?,
                        score_weight_sum = ?,
                        score_count = ?,
                        score_calculated_at = datetime('now')
                    WHERE id = ?
                `).run(pluginTotal, pluginWeight, pluginCount, snapshotId);
                // 2. Blend with the Core Health Score
                // We fetch the current health_score (calculated by HealthService) and treat it
                // as a "Core" provider with a standard weight of 100.
                const snapshot = this.db.prepare('SELECT health_score FROM snapshots WHERE id = ?').get(snapshotId);
                const coreScore = snapshot?.health_score;
                const CORE_WEIGHT = 100;
                let blendedScore;
                if (coreScore !== null && coreScore !== undefined) {
                    // Weighted Average: (CoreScore * 100 + Sum(PluginScore * Weight)) / (100 + Sum(Weights))
                    blendedScore = (coreScore * CORE_WEIGHT + pluginTotal) / (CORE_WEIGHT + pluginWeight);
                }
                else {
                    // Fallback to pure plugin average if core health isn't computed
                    blendedScore = pluginTotal / pluginWeight;
                }
                // A zero weight sum yields NaN/Infinity; never persist that as a health score.
                if (Number.isFinite(blendedScore)) {
                    this.db.prepare('UPDATE snapshots SET health_score = ? WHERE id = ?').run(Number(blendedScore.toFixed(1)), snapshotId);
                }
            }
        }
        catch (err) {
            console.error(`[CrawlithDB.Aggregation] Failed to aggregate snapshot-level scores: ${err.message}`);
        }
    }
    /**
     * Runs `fn` inside a better-sqlite3 transaction.
     * @param {Function} fn - Synchronous callback.
     * @returns {*} Whatever `fn` returns.
     */
    runInTransaction(fn) {
        const tx = this.db.transaction(fn);
        return tx();
    }
    /**
     * Maps a table or plugin name to a registered table name. When neither the name itself
     * nor its `_toTableName` form is registered, the input is returned unchanged so that a
     * later `_assertTableRegistered` call reports the failure.
     * @param {string} name
     * @returns {string}
     */
    _resolveTableName(name) {
        if (this.registry.isTableRegistered(name))
            return name;
        const pluginTable = this._toTableName(name);
        if (this.registry.isTableRegistered(pluginTable))
            return pluginTable;
        return name; // Will fail assertion later
    }
    /** Converts a plugin name to its canonical SQLite table name, sanitizing invalid characters. */
    _toTableName(pluginName) {
        return `${pluginName.replace(/-/g, '_')}_plugin`;
    }
    /** Closes the underlying database connection. */
    close() {
        this.db.close();
    }
    /**
     * @param {string} pluginName
     * @returns {boolean} True when the `getMigration` statement finds a row for the plugin.
     */
    _isMigrationExecuted(pluginName) {
        const row = this.statements.getMigration.get(pluginName);
        return !!row;
    }
    /**
     * @param {number} snapshotId
     * @throws {Error} When the `getSnapshot` statement finds no row.
     */
    _assertSnapshotExists(snapshotId) {
        const row = this.statements.getSnapshot.get(snapshotId);
        if (!row) {
            throw new Error(`Snapshot ID ${snapshotId} does not exist`);
        }
    }
    /**
     * @param {string} tableName
     * @throws {Error} When the registry does not know the table.
     */
    _assertTableRegistered(tableName) {
        if (!this.registry.isTableRegistered(tableName)) {
            throw new Error(`Access Denied: Table "${tableName}" is not registered by any plugin migration.`);
        }
    }
    /**
     * Enforces that a scoped instance only touches tables owned by its plugin.
     * @param {string} tableName
     * @throws {Error} When the registry attributes the table to a different plugin.
     */
    _assertOwnership(tableName) {
        if (!this._pluginName)
            return; // Unbound instance has full access
        const owner = this.registry.getPluginForTable(tableName);
        if (owner !== this._pluginName) {
            throw new Error(`Security Violation: Plugin "${this._pluginName}" attempted to access table "${tableName}" owned by "${owner}".`);
        }
    }
    /**
     * Rejects any name that is not a bare SQL identifier. Used for every table/column
     * name that gets interpolated into statement text.
     * @param {*} identifier
     * @param {string} kind - Human-readable label used in the error message.
     * @throws {Error} When `identifier` is not a string matching `_IDENTIFIER_PATTERN`.
     */
    _assertSafeIdentifier(identifier, kind) {
        if (typeof identifier !== 'string' || !CrawlithDB._IDENTIFIER_PATTERN.test(identifier)) {
            throw new Error(`Invalid ${kind} "${String(identifier)}": must be a plain SQL identifier (letters, digits, underscores; not starting with a digit)`);
        }
    }
}
@@ -10,9 +10,9 @@ export function loadGraphFromSnapshot(snapshotId) {
10
10
  const edgeRepo = new EdgeRepository(db);
11
11
  const metricsRepo = new MetricsRepository(db);
12
12
  const snapshotRepo = new SnapshotRepository(db);
13
- const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
14
- const metrics = metricsRepo.getMetricsIterator(snapshotId);
15
13
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
14
+ const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId, snapshot?.run_type || 'completed');
15
+ const metrics = metricsRepo.getMetricsIterator(snapshotId);
16
16
  const metricsMap = new Map();
17
17
  for (const m of metrics) {
18
18
  metricsMap.set(m.page_id, m);
@@ -27,7 +27,7 @@ export function loadGraphFromSnapshot(snapshotId) {
27
27
  const idMap = new Map();
28
28
  for (const p of pages) {
29
29
  idMap.set(p.id, p.normalized_url);
30
- graph.addNode(p.normalized_url, p.depth, p.http_status || 0);
30
+ graph.addNode(p.normalized_url, p.depth, p.http_status || 0, !!p.is_internal);
31
31
  const m = metricsMap.get(p.id);
32
32
  if (m) {
33
33
  const isProcessed = m.crawl_status === 'fetched' ||
@@ -53,13 +53,14 @@ export function loadGraphFromSnapshot(snapshotId) {
53
53
  incrementalStatus = 'changed';
54
54
  }
55
55
  graph.updateNodeData(p.normalized_url, {
56
+ isInternal: !!p.is_internal,
56
57
  canonical: p.canonical_url || undefined,
58
+ discoveredViaSitemap: !!p.discovered_via_sitemap,
57
59
  contentHash: p.content_hash || undefined,
58
60
  simhash: p.simhash || undefined,
59
61
  etag: p.etag || undefined,
60
62
  lastModified: p.last_modified || undefined,
61
63
  html: p.html || undefined,
62
- soft404Score: p.soft404_score || undefined,
63
64
  noindex: !!p.noindex,
64
65
  nofollow: !!p.nofollow,
65
66
  incrementalStatus,
@@ -71,21 +72,21 @@ export function loadGraphFromSnapshot(snapshotId) {
71
72
  crawlTrapRisk: p.crawl_trap_risk || undefined,
72
73
  trapType: p.trap_type || undefined,
73
74
  // Metrics
74
- pageRank: m?.pagerank ?? undefined,
75
- pageRankScore: m?.pagerank_score ?? m?.pagerank ?? undefined,
76
- authorityScore: m?.authority_score ?? undefined,
77
- hubScore: m?.hub_score ?? undefined,
78
- linkRole: m?.link_role ?? undefined,
79
- // Duplicate info
80
- duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
81
- duplicateType: m?.duplicate_type ?? undefined,
82
- isClusterPrimary: m?.is_cluster_primary ? true : undefined,
83
75
  // Additional metrics
84
76
  crawlStatus: m?.crawl_status || undefined,
85
77
  wordCount: m?.word_count != null ? m.word_count : undefined,
86
78
  thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
87
79
  externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
88
- orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
80
+ pagerankScore: m?.pagerank_score ?? undefined,
81
+ hubScore: m?.hub_score ?? undefined,
82
+ authScore: m?.auth_score ?? undefined,
83
+ linkRole: m?.link_role ?? undefined,
84
+ soft404Score: m?.soft404_score ?? undefined,
85
+ headingScore: m?.heading_score ?? undefined,
86
+ orphanScore: m?.orphan_score ?? undefined,
87
+ orphanType: m?.orphan_type ?? undefined,
88
+ impactLevel: m?.impact_level ?? undefined,
89
+ headingData: m?.heading_data ?? undefined,
89
90
  });
90
91
  }
91
92
  const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);
@@ -96,24 +97,6 @@ export function loadGraphFromSnapshot(snapshotId) {
96
97
  graph.addEdge(source, target, e.weight || 1.0);
97
98
  }
98
99
  }
99
- // Load duplicate clusters
100
- const dupClusters = db.prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?').all(snapshotId);
101
- graph.duplicateClusters = dupClusters.map(c => ({
102
- id: c.id,
103
- type: c.type,
104
- size: c.size,
105
- representative: c.representative,
106
- severity: c.severity
107
- }));
108
- // Load content clusters
109
- const contentClusters = db.prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?').all(snapshotId);
110
- graph.contentClusters = contentClusters.map(c => ({
111
- id: c.id,
112
- count: c.count,
113
- primaryUrl: c.primary_url,
114
- risk: c.risk,
115
- sharedPathPrefix: c.shared_path_prefix || undefined
116
- }));
117
100
  // Set session stats
118
101
  graph.sessionStats = {
119
102
  pagesFetched,
@@ -1,7 +1,15 @@
1
1
  import Database from 'better-sqlite3';
2
+ import { CrawlithDB } from './CrawlithDB.js';
2
3
  export * from './repositories/SiteRepository.js';
3
4
  export * from './repositories/SnapshotRepository.js';
4
- export { initSchema } from './schema.js';
5
+ export * from './CrawlithDB.js';
5
6
  export declare function getDbPath(): string;
7
+ /**
8
+ * Returns the higher-level CrawlithDB wrapper for plugins and new code.
9
+ */
10
+ export declare function getCrawlithDB(): CrawlithDB;
11
+ /**
12
+ * Returns the raw better-sqlite3 Database instance for legacy repositories.
13
+ */
6
14
  export declare function getDb(): Database.Database;
7
15
  export declare function closeDb(): void;