@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
package/dist/db/index.js CHANGED
@@ -1,9 +1,12 @@
1
- import Database from 'better-sqlite3';
2
1
  import path from 'node:path';
3
2
  import fs from 'node:fs';
4
3
  import os from 'node:os';
5
- import { initSchema } from './schema.js';
4
+ import { CrawlithDB } from './CrawlithDB.js';
6
5
  let dbInstance = null;
6
+ let crawlithDbInstance = null;
7
+ export * from './repositories/SiteRepository.js';
8
+ export * from './repositories/SnapshotRepository.js';
9
+ export * from './CrawlithDB.js';
7
10
  export function getDbPath() {
8
11
  if (process.env.NODE_ENV === 'test') {
9
12
  return ':memory:';
@@ -20,41 +23,50 @@ export function getDbPath() {
20
23
  }
21
24
  return path.join(crawlithDir, 'crawlith.db');
22
25
  }
23
- export function getDb() {
24
- if (dbInstance) {
25
- return dbInstance;
26
+ /**
27
+ * Returns the higher-level CrawlithDB wrapper for plugins and new code.
28
+ */
29
+ export function getCrawlithDB() {
30
+ if (crawlithDbInstance) {
31
+ return crawlithDbInstance;
26
32
  }
27
33
  const dbPath = getDbPath();
28
- const db = new Database(dbPath);
29
- // Hardening & Performance Configuration
30
- db.pragma('journal_mode = WAL');
31
- db.pragma('synchronous = NORMAL');
32
- db.pragma('foreign_keys = ON');
33
- db.pragma('temp_store = MEMORY');
34
- db.pragma('mmap_size = 30000000000');
35
- db.pragma('cache_size = -20000');
36
- db.pragma('busy_timeout = 5000');
37
- // Security controls
38
- // Ensure file permissions are 600 (user read/write only)
34
+ crawlithDbInstance = new CrawlithDB(dbPath);
35
+ dbInstance = crawlithDbInstance.unsafeGetRawDb();
36
+ // Migrations for existing tables
39
37
  try {
40
- fs.chmodSync(dbPath, 0o600);
38
+ dbInstance.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
41
39
  }
42
- catch (_e) {
43
- // might fail on first creation if file doesn't exist yet, but better-sqlite3 creates it
44
- // so we can try again or ignore if it's new
40
+ catch (_e) { /* ignore */ }
41
+ // Security controls: Ensure file permissions are 600 (user read/write only)
42
+ if (dbPath !== ':memory:') {
43
+ try {
44
+ fs.chmodSync(dbPath, 0o600);
45
+ }
46
+ catch (_e) {
47
+ // might fail if file doesn't exist yet but better-sqlite3 should have created it
48
+ }
45
49
  }
46
- // Integrity check on startup
47
- const integrity = db.pragma('integrity_check', { simple: true });
48
- if (integrity !== 'ok') {
49
- console.warn('Database integrity check failed:', integrity);
50
+ return crawlithDbInstance;
51
+ }
52
+ /**
53
+ * Returns the raw better-sqlite3 Database instance for legacy repositories.
54
+ */
55
+ export function getDb() {
56
+ if (dbInstance) {
57
+ return dbInstance;
50
58
  }
51
- // Initialize schema
52
- initSchema(db);
53
- dbInstance = db;
54
- return db;
59
+ // Initializing via getCrawlithDB ensures consistent configuration
60
+ getCrawlithDB();
61
+ return dbInstance;
55
62
  }
56
63
  export function closeDb() {
57
- if (dbInstance) {
64
+ if (crawlithDbInstance) {
65
+ crawlithDbInstance.close();
66
+ crawlithDbInstance = null;
67
+ dbInstance = null;
68
+ }
69
+ else if (dbInstance) {
58
70
  dbInstance.close();
59
71
  dbInstance = null;
60
72
  }
@@ -0,0 +1,2 @@
1
+ import { Database } from 'better-sqlite3';
2
+ export declare function runBaseMigrations(db: Database): void;
@@ -1,9 +1,11 @@
1
- export function initSchema(db) {
1
+ export function runBaseMigrations(db) {
2
2
  // Sites Table
3
3
  db.exec(`
4
4
  CREATE TABLE IF NOT EXISTS sites (
5
5
  id INTEGER PRIMARY KEY AUTOINCREMENT,
6
6
  domain TEXT UNIQUE NOT NULL,
7
+ preferred_url TEXT,
8
+ ssl INTEGER,
7
9
  created_at TEXT DEFAULT (datetime('now')),
8
10
  settings_json TEXT,
9
11
  is_active INTEGER DEFAULT 1
@@ -14,18 +16,36 @@ export function initSchema(db) {
14
16
  CREATE TABLE IF NOT EXISTS snapshots (
15
17
  id INTEGER PRIMARY KEY AUTOINCREMENT,
16
18
  site_id INTEGER NOT NULL,
17
- type TEXT CHECK(type IN ('full', 'partial', 'incremental')) NOT NULL,
19
+ run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed',
18
20
  created_at TEXT DEFAULT (datetime('now')),
19
21
  node_count INTEGER DEFAULT 0,
20
22
  edge_count INTEGER DEFAULT 0,
21
- status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
23
+ status TEXT CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'running',
22
24
  limit_reached INTEGER DEFAULT 0,
23
25
  health_score REAL,
24
26
  orphan_count INTEGER,
25
27
  thin_content_count INTEGER,
28
+ total_score REAL,
29
+ score_count INTEGER,
30
+ score_weight_sum REAL,
31
+ score_calculated_at TEXT,
26
32
  FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
27
33
  );
28
34
  `);
35
+ // Migration for snapshots: run_type and status
36
+ try {
37
+ db.exec(`ALTER TABLE snapshots ADD COLUMN run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed';`);
38
+ }
39
+ catch (_e) { /* ignore */ }
40
+ try {
41
+ // If type column exists, populate run_type from it
42
+ db.exec(`UPDATE snapshots SET run_type = CASE WHEN type = 'partial' THEN 'single' ELSE 'completed' END WHERE run_type IS NULL OR run_type = 'full' OR run_type = 'completed';`);
43
+ }
44
+ catch (_e) { /* ignore */ }
45
+ try {
46
+ db.exec(`ALTER TABLE snapshots DROP COLUMN type;`);
47
+ }
48
+ catch (_e) { /* ignore */ }
29
49
  // Pages Table
30
50
  db.exec(`
31
51
  CREATE TABLE IF NOT EXISTS pages (
@@ -41,14 +61,15 @@ export function initSchema(db) {
41
61
  etag TEXT,
42
62
  last_modified TEXT,
43
63
  html TEXT,
44
- soft404_score REAL,
45
64
  noindex INTEGER DEFAULT 0,
46
65
  nofollow INTEGER DEFAULT 0,
47
66
  security_error TEXT,
48
67
  retries INTEGER DEFAULT 0,
49
68
  depth INTEGER DEFAULT 0,
69
+ discovered_via_sitemap INTEGER DEFAULT 0,
50
70
  redirect_chain TEXT,
51
71
  bytes_received INTEGER,
72
+ is_internal INTEGER DEFAULT 1,
52
73
  crawl_trap_flag INTEGER DEFAULT 0,
53
74
  crawl_trap_risk REAL,
54
75
  trap_type TEXT,
@@ -60,7 +81,15 @@ export function initSchema(db) {
60
81
  UNIQUE(site_id, normalized_url)
61
82
  );
62
83
  `);
63
- // Index for Pages
84
+ // Migrations for existing tables
85
+ try {
86
+ db.exec(`ALTER TABLE pages ADD COLUMN is_internal INTEGER DEFAULT 1;`);
87
+ }
88
+ catch (_e) { /* ignore */ }
89
+ try {
90
+ db.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
91
+ }
92
+ catch (_e) { /* ignore */ }
64
93
  db.exec(`CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`);
65
94
  // Edges Table
66
95
  db.exec(`
@@ -76,26 +105,29 @@ export function initSchema(db) {
76
105
  FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
77
106
  );
78
107
  `);
79
- // Index for Edges
80
- db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`);
81
108
  db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`);
82
109
  // Metrics Table
83
110
  db.exec(`
84
111
  CREATE TABLE IF NOT EXISTS metrics (
85
112
  snapshot_id INTEGER NOT NULL,
86
113
  page_id INTEGER NOT NULL,
87
- authority_score REAL,
88
- hub_score REAL,
89
- pagerank REAL,
90
- pagerank_score REAL,
91
- link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
92
114
  crawl_status TEXT,
93
115
  word_count INTEGER,
94
116
  thin_content_score REAL,
95
117
  external_link_ratio REAL,
96
118
  orphan_score INTEGER,
119
+ pagerank_score REAL,
120
+ hub_score REAL,
121
+ auth_score REAL,
122
+ link_role TEXT,
97
123
  duplicate_cluster_id TEXT,
98
- duplicate_type TEXT CHECK(duplicate_type IN ('exact', 'near', 'template_heavy', 'none')),
124
+ duplicate_type TEXT,
125
+ cluster_id INTEGER,
126
+ soft404_score REAL,
127
+ heading_score REAL,
128
+ orphan_type TEXT,
129
+ impact_level TEXT,
130
+ heading_data TEXT,
99
131
  is_cluster_primary INTEGER DEFAULT 0,
100
132
  PRIMARY KEY(snapshot_id, page_id),
101
133
  FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
@@ -129,41 +161,56 @@ export function initSchema(db) {
129
161
  FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
130
162
  );
131
163
  `);
132
- // Migration: add columns to existing DBs that were created before this update
133
- migrateSchema(db);
134
- }
135
- function migrateSchema(db) {
136
- // Add missing columns to pages (safe: ALTER TABLE ADD COLUMN is idempotent-safe with try/catch)
137
- const pageColumns = [
138
- ['redirect_chain', 'TEXT'],
139
- ['bytes_received', 'INTEGER'],
140
- ['crawl_trap_flag', 'INTEGER DEFAULT 0'],
141
- ['crawl_trap_risk', 'REAL'],
142
- ['trap_type', 'TEXT'],
143
- ];
144
- for (const [col, type] of pageColumns) {
145
- try {
146
- db.exec(`ALTER TABLE pages ADD COLUMN ${col} ${type}`);
147
- }
148
- catch { /* already exists */ }
149
- }
150
- // Add missing columns to edges
151
- try {
152
- db.exec('ALTER TABLE edges ADD COLUMN weight REAL DEFAULT 1.0');
153
- }
154
- catch { /* already exists */ }
155
- // Add missing columns to metrics
156
- const metricsColumns = [
164
+ // Plugin Migrations Table
165
+ db.exec(`
166
+ CREATE TABLE IF NOT EXISTS plugin_migrations (
167
+ plugin_name TEXT PRIMARY KEY,
168
+ executed_at TEXT NOT NULL DEFAULT (datetime('now'))
169
+ );
170
+ `);
171
+ // Universal Plugin Reports Table
172
+ db.exec(`
173
+ CREATE TABLE IF NOT EXISTS plugin_reports (
174
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
175
+ snapshot_id INTEGER NOT NULL,
176
+ plugin_name TEXT NOT NULL,
177
+ data TEXT NOT NULL,
178
+ total_score REAL,
179
+ score_count INTEGER,
180
+ score_weight_sum REAL,
181
+ score_calculated_at TEXT,
182
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
183
+ FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
184
+ );
185
+ `);
186
+ db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_snapshot ON plugin_reports(snapshot_id);`);
187
+ db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_composite ON plugin_reports(snapshot_id, plugin_name);`);
188
+ // Migrations for metrics
189
+ const metricsCols = [
157
190
  ['pagerank_score', 'REAL'],
191
+ ['hub_score', 'REAL'],
192
+ ['auth_score', 'REAL'],
158
193
  ['link_role', 'TEXT'],
159
- ['duplicate_cluster_id', 'TEXT'],
160
- ['duplicate_type', 'TEXT'],
161
- ['is_cluster_primary', 'INTEGER DEFAULT 0'],
194
+ ['cluster_id', 'INTEGER'],
195
+ ['soft404_score', 'REAL'],
196
+ ['heading_score', 'REAL'],
197
+ ['orphan_type', 'TEXT'],
198
+ ['impact_level', 'TEXT'],
199
+ ['heading_data', 'TEXT'],
162
200
  ];
163
- for (const [col, type] of metricsColumns) {
201
+ for (const [col, type] of metricsCols) {
164
202
  try {
165
203
  db.exec(`ALTER TABLE metrics ADD COLUMN ${col} ${type}`);
166
204
  }
167
- catch { /* already exists */ }
205
+ catch { /* ignore */ }
206
+ }
207
+ // Final site column migrations
208
+ try {
209
+ db.exec('ALTER TABLE sites ADD COLUMN preferred_url TEXT');
210
+ }
211
+ catch { /* ignore */ }
212
+ try {
213
+ db.exec('ALTER TABLE sites ADD COLUMN ssl INTEGER');
168
214
  }
215
+ catch { /* ignore */ }
169
216
  }
@@ -0,0 +1,9 @@
1
+ export declare class PluginRegistry {
2
+ private registeredPlugins;
3
+ private registeredTables;
4
+ registerPlugin(pluginName: string): void;
5
+ isPluginRegistered(pluginName: string): boolean;
6
+ registerTable(tableName: string, pluginName: string): void;
7
+ getPluginForTable(tableName: string): string | undefined;
8
+ isTableRegistered(tableName: string): boolean;
9
+ }
@@ -0,0 +1,19 @@
1
+ export class PluginRegistry {
2
+ registeredPlugins = new Set();
3
+ registeredTables = new Map(); // tableName -> pluginName
4
+ registerPlugin(pluginName) {
5
+ this.registeredPlugins.add(pluginName);
6
+ }
7
+ isPluginRegistered(pluginName) {
8
+ return this.registeredPlugins.has(pluginName);
9
+ }
10
+ registerTable(tableName, pluginName) {
11
+ this.registeredTables.set(tableName, pluginName);
12
+ }
13
+ getPluginForTable(tableName) {
14
+ return this.registeredTables.get(tableName);
15
+ }
16
+ isTableRegistered(tableName) {
17
+ return this.registeredTables.has(tableName);
18
+ }
19
+ }
@@ -12,5 +12,18 @@ export declare class EdgeRepository {
12
12
  private insertStmt;
13
13
  constructor(db: Database);
14
14
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight?: number, rel?: string): void;
15
+ insertEdges(edges: {
16
+ snapshot_id: number;
17
+ source_page_id: number;
18
+ target_page_id: number;
19
+ weight: number;
20
+ rel: string;
21
+ }[]): void;
22
+ /**
23
+ * Remove all edges originating from a specific page within a snapshot.
24
+ * Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
25
+ */
26
+ deleteEdgesForPage(snapshotId: number, sourcePageId: number): void;
15
27
  getEdgesBySnapshot(snapshotId: number): Edge[];
28
+ getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
16
29
  }
@@ -11,7 +11,27 @@ export class EdgeRepository {
11
11
  insertEdge(snapshotId, sourcePageId, targetPageId, weight = 1.0, rel = 'internal') {
12
12
  this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
13
13
  }
14
+ insertEdges(edges) {
15
+ if (edges.length === 0)
16
+ return;
17
+ const tx = this.db.transaction((edgesBatch) => {
18
+ for (const edge of edgesBatch) {
19
+ this.insertStmt.run(edge.snapshot_id, edge.source_page_id, edge.target_page_id, edge.weight, edge.rel);
20
+ }
21
+ });
22
+ tx(edges);
23
+ }
24
+ /**
25
+ * Remove all edges originating from a specific page within a snapshot.
26
+ * Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
27
+ */
28
+ deleteEdgesForPage(snapshotId, sourcePageId) {
29
+ this.db.prepare('DELETE FROM edges WHERE snapshot_id = ? AND source_page_id = ?').run(snapshotId, sourcePageId);
30
+ }
14
31
  getEdgesBySnapshot(snapshotId) {
15
32
  return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
16
33
  }
34
+ getEdgesIteratorBySnapshot(snapshotId) {
35
+ return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').iterate(snapshotId);
36
+ }
17
37
  }
@@ -2,25 +2,33 @@ import { Database } from 'better-sqlite3';
2
2
  export interface DbMetrics {
3
3
  snapshot_id: number;
4
4
  page_id: number;
5
- authority_score: number | null;
6
- hub_score: number | null;
7
- pagerank: number | null;
8
- pagerank_score: number | null;
9
- link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
10
5
  crawl_status: string | null;
11
6
  word_count: number | null;
12
7
  thin_content_score: number | null;
13
8
  external_link_ratio: number | null;
14
- orphan_score: number | null;
9
+ pagerank_score: number | null;
10
+ hub_score: number | null;
11
+ auth_score: number | null;
12
+ link_role: string | null;
15
13
  duplicate_cluster_id: string | null;
16
- duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
17
- is_cluster_primary: number;
14
+ duplicate_type: string | null;
15
+ cluster_id: number | null;
16
+ soft404_score: number | null;
17
+ heading_score: number | null;
18
+ orphan_score: number | null;
19
+ orphan_type: string | null;
20
+ impact_level: string | null;
21
+ heading_data: string | null;
22
+ is_cluster_primary: number | null;
18
23
  }
19
24
  export declare class MetricsRepository {
20
25
  private db;
21
26
  private insertStmt;
27
+ private getByPageStmt;
22
28
  constructor(db: Database);
23
29
  insertMetrics(metrics: DbMetrics): void;
24
30
  getMetrics(snapshotId: number): DbMetrics[];
31
+ getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics>;
25
32
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined;
33
+ insertMany(metricsList: DbMetrics[]): void;
26
34
  }
@@ -1,17 +1,27 @@
1
1
  export class MetricsRepository {
2
2
  db;
3
3
  insertStmt;
4
+ getByPageStmt;
4
5
  constructor(db) {
5
6
  this.db = db;
7
+ this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
6
8
  this.insertStmt = this.db.prepare(`
7
9
  INSERT OR REPLACE INTO metrics (
8
- snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
9
- link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
10
- orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
10
+ snapshot_id, page_id,
11
+ crawl_status, word_count, thin_content_score, external_link_ratio,
12
+ pagerank_score, hub_score, auth_score, link_role,
13
+ duplicate_cluster_id, duplicate_type, cluster_id,
14
+ soft404_score, heading_score,
15
+ orphan_score, orphan_type, impact_level,
16
+ heading_data, is_cluster_primary
11
17
  ) VALUES (
12
- @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
13
- @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
14
- @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
18
+ @snapshot_id, @page_id,
19
+ @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
20
+ @pagerank_score, @hub_score, @auth_score, @link_role,
21
+ @duplicate_cluster_id, @duplicate_type, @cluster_id,
22
+ @soft404_score, @heading_score,
23
+ @orphan_score, @orphan_type, @impact_level,
24
+ @heading_data, @is_cluster_primary
15
25
  )
16
26
  `);
17
27
  }
@@ -21,7 +31,18 @@ export class MetricsRepository {
21
31
  getMetrics(snapshotId) {
22
32
  return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId);
23
33
  }
34
+ getMetricsIterator(snapshotId) {
35
+ return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId);
36
+ }
24
37
  getMetricsForPage(snapshotId, pageId) {
25
- return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?').get(snapshotId, pageId);
38
+ return this.getByPageStmt.get(snapshotId, pageId);
39
+ }
40
+ insertMany(metricsList) {
41
+ const insert = this.insertStmt;
42
+ const tx = this.db.transaction((items) => {
43
+ for (const item of items)
44
+ insert.run(item);
45
+ });
46
+ tx(metricsList);
26
47
  }
27
48
  }
@@ -12,14 +12,15 @@ export interface Page {
12
12
  etag: string | null;
13
13
  last_modified: string | null;
14
14
  html: string | null;
15
- soft404_score: number | null;
16
15
  noindex: number;
17
16
  nofollow: number;
18
17
  security_error: string | null;
19
18
  retries: number;
20
19
  depth: number;
20
+ discovered_via_sitemap: number;
21
21
  redirect_chain: string | null;
22
22
  bytes_received: number | null;
23
+ is_internal: number;
23
24
  crawl_trap_flag: number;
24
25
  crawl_trap_risk: number | null;
25
26
  trap_type: string | null;
@@ -42,6 +43,18 @@ export declare class PageRepository {
42
43
  last_seen_snapshot_id: number;
43
44
  }): number;
44
45
  getPage(siteId: number, url: string): Page | undefined;
45
- getPagesBySnapshot(snapshotId: number): Page[];
46
+ getPagesByUrls(siteId: number, urls: string[]): Page[];
47
+ upsertMany(pages: (Partial<Page> & {
48
+ site_id: number;
49
+ normalized_url: string;
50
+ last_seen_snapshot_id: number;
51
+ })[]): Map<string, number>;
52
+ getPagesBySnapshot(snapshotId: number, runType?: string): Page[];
53
+ getPagesIdentityBySnapshot(snapshotId: number): {
54
+ id: number;
55
+ normalized_url: string;
56
+ }[];
57
+ getPagesIteratorBySnapshot(snapshotId: number, runType?: string): IterableIterator<Page>;
46
58
  getIdByUrl(siteId: number, url: string): number | undefined;
59
+ reconcileInternalUrls(siteId: number, siteOrigin: string): void;
47
60
  }