@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
package/dist/db/index.js CHANGED
@@ -1,12 +1,12 @@
1
- import Database from 'better-sqlite3';
2
1
  import path from 'node:path';
3
2
  import fs from 'node:fs';
4
3
  import os from 'node:os';
5
- import { initSchema } from './schema.js';
4
+ import { CrawlithDB } from './CrawlithDB.js';
6
5
  let dbInstance = null;
6
+ let crawlithDbInstance = null;
7
7
  export * from './repositories/SiteRepository.js';
8
8
  export * from './repositories/SnapshotRepository.js';
9
- export { initSchema } from './schema.js';
9
+ export * from './CrawlithDB.js';
10
10
  export function getDbPath() {
11
11
  if (process.env.NODE_ENV === 'test') {
12
12
  return ':memory:';
@@ -23,42 +23,50 @@ export function getDbPath() {
23
23
  }
24
24
  return path.join(crawlithDir, 'crawlith.db');
25
25
  }
26
- export function getDb() {
27
- if (dbInstance) {
28
- return dbInstance;
26
+ /**
27
+ * Returns the higher-level CrawlithDB wrapper for plugins and new code.
28
+ */
29
+ export function getCrawlithDB() {
30
+ if (crawlithDbInstance) {
31
+ return crawlithDbInstance;
29
32
  }
30
33
  const dbPath = getDbPath();
31
- const db = new Database(dbPath);
32
- // Hardening & Performance Configuration
33
- db.pragma('journal_mode = WAL');
34
- db.pragma('synchronous = NORMAL');
35
- db.pragma('foreign_keys = ON');
36
- db.pragma('temp_store = MEMORY');
37
- db.pragma('mmap_size = 30000000000');
38
- db.pragma('cache_size = -20000');
39
- db.pragma('busy_timeout = 5000');
40
- // Security controls
41
- // Ensure file permissions are 600 (user read/write only)
34
+ crawlithDbInstance = new CrawlithDB(dbPath);
35
+ dbInstance = crawlithDbInstance.unsafeGetRawDb();
36
+ // Migrations for existing tables
42
37
  try {
43
- fs.chmodSync(dbPath, 0o600);
38
+ dbInstance.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
44
39
  }
45
- catch (_e) {
46
- // might fail on first creation if file doesn't exist yet, but better-sqlite3 creates it
47
- // so we can try again or ignore if it's new
40
+ catch (_e) { /* ignore */ }
41
+ // Security controls: Ensure file permissions are 600 (user read/write only)
42
+ if (dbPath !== ':memory:') {
43
+ try {
44
+ fs.chmodSync(dbPath, 0o600);
45
+ }
46
+ catch (_e) {
47
+ // might fail if file doesn't exist yet but better-sqlite3 should have created it
48
+ }
48
49
  }
49
- // Integrity check on startup
50
- const integrity = db.pragma('integrity_check', { simple: true });
51
- if (integrity !== 'ok') {
52
- // Reverted to console.warn to avoid breaking change
53
- console.warn('Database integrity check failed:', integrity);
50
+ return crawlithDbInstance;
51
+ }
52
+ /**
53
+ * Returns the raw better-sqlite3 Database instance for legacy repositories.
54
+ */
55
+ export function getDb() {
56
+ if (dbInstance) {
57
+ return dbInstance;
54
58
  }
55
- // Initialize schema
56
- initSchema(db);
57
- dbInstance = db;
58
- return db;
59
+ // Initializing via getCrawlithDB ensures consistent configuration
60
+ getCrawlithDB();
61
+ return dbInstance;
59
62
  }
60
63
  export function closeDb() {
61
- if (dbInstance) {
64
+ if (crawlithDbInstance) {
65
+ crawlithDbInstance.close();
66
+ crawlithDbInstance = null;
67
+ dbInstance = null;
68
+ }
69
+ else if (dbInstance) {
62
70
  dbInstance.close();
63
71
  dbInstance = null;
64
72
  }
@@ -0,0 +1,2 @@
1
+ import { Database } from 'better-sqlite3';
2
+ export declare function runBaseMigrations(db: Database): void;
@@ -1,9 +1,11 @@
1
- export function initSchema(db) {
1
+ export function runBaseMigrations(db) {
2
2
  // Sites Table
3
3
  db.exec(`
4
4
  CREATE TABLE IF NOT EXISTS sites (
5
5
  id INTEGER PRIMARY KEY AUTOINCREMENT,
6
6
  domain TEXT UNIQUE NOT NULL,
7
+ preferred_url TEXT,
8
+ ssl INTEGER,
7
9
  created_at TEXT DEFAULT (datetime('now')),
8
10
  settings_json TEXT,
9
11
  is_active INTEGER DEFAULT 1
@@ -14,18 +16,36 @@ export function initSchema(db) {
14
16
  CREATE TABLE IF NOT EXISTS snapshots (
15
17
  id INTEGER PRIMARY KEY AUTOINCREMENT,
16
18
  site_id INTEGER NOT NULL,
17
- type TEXT CHECK(type IN ('full', 'partial', 'incremental')) NOT NULL,
19
+ run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed',
18
20
  created_at TEXT DEFAULT (datetime('now')),
19
21
  node_count INTEGER DEFAULT 0,
20
22
  edge_count INTEGER DEFAULT 0,
21
- status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
23
+ status TEXT CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'running',
22
24
  limit_reached INTEGER DEFAULT 0,
23
25
  health_score REAL,
24
26
  orphan_count INTEGER,
25
27
  thin_content_count INTEGER,
28
+ total_score REAL,
29
+ score_count INTEGER,
30
+ score_weight_sum REAL,
31
+ score_calculated_at TEXT,
26
32
  FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
27
33
  );
28
34
  `);
35
+ // Migration for snapshots: run_type and status
36
+ try {
37
+ db.exec(`ALTER TABLE snapshots ADD COLUMN run_type TEXT CHECK(run_type IN ('completed', 'incremental', 'single')) NOT NULL DEFAULT 'completed';`);
38
+ }
39
+ catch (_e) { /* ignore */ }
40
+ try {
41
+ // If type column exists, populate run_type from it
42
+ db.exec(`UPDATE snapshots SET run_type = CASE WHEN type = 'partial' THEN 'single' ELSE 'completed' END WHERE run_type IS NULL OR run_type = 'full' OR run_type = 'completed';`);
43
+ }
44
+ catch (_e) { /* ignore */ }
45
+ try {
46
+ db.exec(`ALTER TABLE snapshots DROP COLUMN type;`);
47
+ }
48
+ catch (_e) { /* ignore */ }
29
49
  // Pages Table
30
50
  db.exec(`
31
51
  CREATE TABLE IF NOT EXISTS pages (
@@ -41,14 +61,15 @@ export function initSchema(db) {
41
61
  etag TEXT,
42
62
  last_modified TEXT,
43
63
  html TEXT,
44
- soft404_score REAL,
45
64
  noindex INTEGER DEFAULT 0,
46
65
  nofollow INTEGER DEFAULT 0,
47
66
  security_error TEXT,
48
67
  retries INTEGER DEFAULT 0,
49
68
  depth INTEGER DEFAULT 0,
69
+ discovered_via_sitemap INTEGER DEFAULT 0,
50
70
  redirect_chain TEXT,
51
71
  bytes_received INTEGER,
72
+ is_internal INTEGER DEFAULT 1,
52
73
  crawl_trap_flag INTEGER DEFAULT 0,
53
74
  crawl_trap_risk REAL,
54
75
  trap_type TEXT,
@@ -60,7 +81,15 @@ export function initSchema(db) {
60
81
  UNIQUE(site_id, normalized_url)
61
82
  );
62
83
  `);
63
- // Index for Pages
84
+ // Migrations for existing tables
85
+ try {
86
+ db.exec(`ALTER TABLE pages ADD COLUMN is_internal INTEGER DEFAULT 1;`);
87
+ }
88
+ catch (_e) { /* ignore */ }
89
+ try {
90
+ db.exec(`ALTER TABLE pages ADD COLUMN discovered_via_sitemap INTEGER DEFAULT 0;`);
91
+ }
92
+ catch (_e) { /* ignore */ }
64
93
  db.exec(`CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`);
65
94
  // Edges Table
66
95
  db.exec(`
@@ -76,26 +105,29 @@ export function initSchema(db) {
76
105
  FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
77
106
  );
78
107
  `);
79
- // Index for Edges
80
- db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`);
81
108
  db.exec(`CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`);
82
109
  // Metrics Table
83
110
  db.exec(`
84
111
  CREATE TABLE IF NOT EXISTS metrics (
85
112
  snapshot_id INTEGER NOT NULL,
86
113
  page_id INTEGER NOT NULL,
87
- authority_score REAL,
88
- hub_score REAL,
89
- pagerank REAL,
90
- pagerank_score REAL,
91
- link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
92
114
  crawl_status TEXT,
93
115
  word_count INTEGER,
94
116
  thin_content_score REAL,
95
117
  external_link_ratio REAL,
96
118
  orphan_score INTEGER,
119
+ pagerank_score REAL,
120
+ hub_score REAL,
121
+ auth_score REAL,
122
+ link_role TEXT,
97
123
  duplicate_cluster_id TEXT,
98
- duplicate_type TEXT CHECK(duplicate_type IN ('exact', 'near', 'template_heavy', 'none')),
124
+ duplicate_type TEXT,
125
+ cluster_id INTEGER,
126
+ soft404_score REAL,
127
+ heading_score REAL,
128
+ orphan_type TEXT,
129
+ impact_level TEXT,
130
+ heading_data TEXT,
99
131
  is_cluster_primary INTEGER DEFAULT 0,
100
132
  PRIMARY KEY(snapshot_id, page_id),
101
133
  FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
@@ -129,41 +161,56 @@ export function initSchema(db) {
129
161
  FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
130
162
  );
131
163
  `);
132
- // Migration: add columns to existing DBs that were created before this update
133
- migrateSchema(db);
134
- }
135
- function migrateSchema(db) {
136
- // Add missing columns to pages (safe: ALTER TABLE ADD COLUMN is idempotent-safe with try/catch)
137
- const pageColumns = [
138
- ['redirect_chain', 'TEXT'],
139
- ['bytes_received', 'INTEGER'],
140
- ['crawl_trap_flag', 'INTEGER DEFAULT 0'],
141
- ['crawl_trap_risk', 'REAL'],
142
- ['trap_type', 'TEXT'],
143
- ];
144
- for (const [col, type] of pageColumns) {
145
- try {
146
- db.exec(`ALTER TABLE pages ADD COLUMN ${col} ${type}`);
147
- }
148
- catch { /* already exists */ }
149
- }
150
- // Add missing columns to edges
151
- try {
152
- db.exec('ALTER TABLE edges ADD COLUMN weight REAL DEFAULT 1.0');
153
- }
154
- catch { /* already exists */ }
155
- // Add missing columns to metrics
156
- const metricsColumns = [
164
+ // Plugin Migrations Table
165
+ db.exec(`
166
+ CREATE TABLE IF NOT EXISTS plugin_migrations (
167
+ plugin_name TEXT PRIMARY KEY,
168
+ executed_at TEXT NOT NULL DEFAULT (datetime('now'))
169
+ );
170
+ `);
171
+ // Universal Plugin Reports Table
172
+ db.exec(`
173
+ CREATE TABLE IF NOT EXISTS plugin_reports (
174
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
175
+ snapshot_id INTEGER NOT NULL,
176
+ plugin_name TEXT NOT NULL,
177
+ data TEXT NOT NULL,
178
+ total_score REAL,
179
+ score_count INTEGER,
180
+ score_weight_sum REAL,
181
+ score_calculated_at TEXT,
182
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
183
+ FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
184
+ );
185
+ `);
186
+ db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_snapshot ON plugin_reports(snapshot_id);`);
187
+ db.exec(`CREATE INDEX IF NOT EXISTS idx_plugin_reports_composite ON plugin_reports(snapshot_id, plugin_name);`);
188
+ // Migrations for metrics
189
+ const metricsCols = [
157
190
  ['pagerank_score', 'REAL'],
191
+ ['hub_score', 'REAL'],
192
+ ['auth_score', 'REAL'],
158
193
  ['link_role', 'TEXT'],
159
- ['duplicate_cluster_id', 'TEXT'],
160
- ['duplicate_type', 'TEXT'],
161
- ['is_cluster_primary', 'INTEGER DEFAULT 0'],
194
+ ['cluster_id', 'INTEGER'],
195
+ ['soft404_score', 'REAL'],
196
+ ['heading_score', 'REAL'],
197
+ ['orphan_type', 'TEXT'],
198
+ ['impact_level', 'TEXT'],
199
+ ['heading_data', 'TEXT'],
162
200
  ];
163
- for (const [col, type] of metricsColumns) {
201
+ for (const [col, type] of metricsCols) {
164
202
  try {
165
203
  db.exec(`ALTER TABLE metrics ADD COLUMN ${col} ${type}`);
166
204
  }
167
- catch { /* already exists */ }
205
+ catch { /* ignore */ }
206
+ }
207
+ // Final site column migrations
208
+ try {
209
+ db.exec('ALTER TABLE sites ADD COLUMN preferred_url TEXT');
210
+ }
211
+ catch { /* ignore */ }
212
+ try {
213
+ db.exec('ALTER TABLE sites ADD COLUMN ssl INTEGER');
168
214
  }
215
+ catch { /* ignore */ }
169
216
  }
@@ -0,0 +1,9 @@
1
+ export declare class PluginRegistry {
2
+ private registeredPlugins;
3
+ private registeredTables;
4
+ registerPlugin(pluginName: string): void;
5
+ isPluginRegistered(pluginName: string): boolean;
6
+ registerTable(tableName: string, pluginName: string): void;
7
+ getPluginForTable(tableName: string): string | undefined;
8
+ isTableRegistered(tableName: string): boolean;
9
+ }
@@ -0,0 +1,19 @@
1
+ export class PluginRegistry {
2
+ registeredPlugins = new Set();
3
+ registeredTables = new Map(); // tableName -> pluginName
4
+ registerPlugin(pluginName) {
5
+ this.registeredPlugins.add(pluginName);
6
+ }
7
+ isPluginRegistered(pluginName) {
8
+ return this.registeredPlugins.has(pluginName);
9
+ }
10
+ registerTable(tableName, pluginName) {
11
+ this.registeredTables.set(tableName, pluginName);
12
+ }
13
+ getPluginForTable(tableName) {
14
+ return this.registeredTables.get(tableName);
15
+ }
16
+ isTableRegistered(tableName) {
17
+ return this.registeredTables.has(tableName);
18
+ }
19
+ }
@@ -19,6 +19,11 @@ export declare class EdgeRepository {
19
19
  weight: number;
20
20
  rel: string;
21
21
  }[]): void;
22
+ /**
23
+ * Remove all edges originating from a specific page within a snapshot.
24
+ * Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
25
+ */
26
+ deleteEdgesForPage(snapshotId: number, sourcePageId: number): void;
22
27
  getEdgesBySnapshot(snapshotId: number): Edge[];
23
28
  getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge>;
24
29
  }
@@ -21,6 +21,13 @@ export class EdgeRepository {
21
21
  });
22
22
  tx(edges);
23
23
  }
24
+ /**
25
+ * Remove all edges originating from a specific page within a snapshot.
26
+ * Used when re-crawling a page into a reused partial snapshot to avoid duplicates.
27
+ */
28
+ deleteEdgesForPage(snapshotId, sourcePageId) {
29
+ this.db.prepare('DELETE FROM edges WHERE snapshot_id = ? AND source_page_id = ?').run(snapshotId, sourcePageId);
30
+ }
24
31
  getEdgesBySnapshot(snapshotId) {
25
32
  return this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?').all(snapshotId);
26
33
  }
@@ -2,19 +2,24 @@ import { Database } from 'better-sqlite3';
2
2
  export interface DbMetrics {
3
3
  snapshot_id: number;
4
4
  page_id: number;
5
- authority_score: number | null;
6
- hub_score: number | null;
7
- pagerank: number | null;
8
- pagerank_score: number | null;
9
- link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
10
5
  crawl_status: string | null;
11
6
  word_count: number | null;
12
7
  thin_content_score: number | null;
13
8
  external_link_ratio: number | null;
14
- orphan_score: number | null;
9
+ pagerank_score: number | null;
10
+ hub_score: number | null;
11
+ auth_score: number | null;
12
+ link_role: string | null;
15
13
  duplicate_cluster_id: string | null;
16
- duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
17
- is_cluster_primary: number;
14
+ duplicate_type: string | null;
15
+ cluster_id: number | null;
16
+ soft404_score: number | null;
17
+ heading_score: number | null;
18
+ orphan_score: number | null;
19
+ orphan_type: string | null;
20
+ impact_level: string | null;
21
+ heading_data: string | null;
22
+ is_cluster_primary: number | null;
18
23
  }
19
24
  export declare class MetricsRepository {
20
25
  private db;
@@ -7,13 +7,21 @@ export class MetricsRepository {
7
7
  this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
8
8
  this.insertStmt = this.db.prepare(`
9
9
  INSERT OR REPLACE INTO metrics (
10
- snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
11
- link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
12
- orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
10
+ snapshot_id, page_id,
11
+ crawl_status, word_count, thin_content_score, external_link_ratio,
12
+ pagerank_score, hub_score, auth_score, link_role,
13
+ duplicate_cluster_id, duplicate_type, cluster_id,
14
+ soft404_score, heading_score,
15
+ orphan_score, orphan_type, impact_level,
16
+ heading_data, is_cluster_primary
13
17
  ) VALUES (
14
- @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
15
- @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
16
- @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
18
+ @snapshot_id, @page_id,
19
+ @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
20
+ @pagerank_score, @hub_score, @auth_score, @link_role,
21
+ @duplicate_cluster_id, @duplicate_type, @cluster_id,
22
+ @soft404_score, @heading_score,
23
+ @orphan_score, @orphan_type, @impact_level,
24
+ @heading_data, @is_cluster_primary
17
25
  )
18
26
  `);
19
27
  }
@@ -12,14 +12,15 @@ export interface Page {
12
12
  etag: string | null;
13
13
  last_modified: string | null;
14
14
  html: string | null;
15
- soft404_score: number | null;
16
15
  noindex: number;
17
16
  nofollow: number;
18
17
  security_error: string | null;
19
18
  retries: number;
20
19
  depth: number;
20
+ discovered_via_sitemap: number;
21
21
  redirect_chain: string | null;
22
22
  bytes_received: number | null;
23
+ is_internal: number;
23
24
  crawl_trap_flag: number;
24
25
  crawl_trap_risk: number | null;
25
26
  trap_type: string | null;
@@ -48,11 +49,12 @@ export declare class PageRepository {
48
49
  normalized_url: string;
49
50
  last_seen_snapshot_id: number;
50
51
  })[]): Map<string, number>;
51
- getPagesBySnapshot(snapshotId: number): Page[];
52
+ getPagesBySnapshot(snapshotId: number, runType?: string): Page[];
52
53
  getPagesIdentityBySnapshot(snapshotId: number): {
53
54
  id: number;
54
55
  normalized_url: string;
55
56
  }[];
56
- getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page>;
57
+ getPagesIteratorBySnapshot(snapshotId: number, runType?: string): IterableIterator<Page>;
57
58
  getIdByUrl(siteId: number, url: string): number | undefined;
59
+ reconcileInternalUrls(siteId: number, siteOrigin: string): void;
58
60
  }
@@ -8,17 +8,18 @@ export class PageRepository {
8
8
  INSERT INTO pages (
9
9
  site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
10
10
  http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
11
- soft404_score, noindex, nofollow, security_error, retries, depth,
12
- redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
11
+ noindex, nofollow, security_error, retries, depth,
12
+ discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
13
13
  updated_at
14
14
  ) VALUES (
15
15
  @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
16
16
  @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
17
- @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
18
- @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
17
+ @noindex, @nofollow, @security_error, @retries, @depth,
18
+ @discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
19
19
  datetime('now')
20
20
  )
21
21
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
22
+ first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
22
23
  last_seen_snapshot_id = excluded.last_seen_snapshot_id,
23
24
  http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
24
25
  canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
@@ -27,14 +28,15 @@ export class PageRepository {
27
28
  etag = COALESCE(excluded.etag, pages.etag),
28
29
  last_modified = COALESCE(excluded.last_modified, pages.last_modified),
29
30
  html = COALESCE(excluded.html, pages.html),
30
- soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
31
31
  noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
32
32
  nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
33
33
  security_error = COALESCE(excluded.security_error, pages.security_error),
34
34
  retries = MAX(pages.retries, excluded.retries),
35
35
  depth = MIN(pages.depth, excluded.depth),
36
+ discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
36
37
  redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
37
38
  bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
39
+ is_internal = COALESCE(excluded.is_internal, pages.is_internal),
38
40
  crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
39
41
  crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
40
42
  trap_type = COALESCE(excluded.trap_type, pages.trap_type),
@@ -55,14 +57,15 @@ export class PageRepository {
55
57
  etag: page.etag ?? null,
56
58
  last_modified: page.last_modified ?? null,
57
59
  html: page.html ?? null,
58
- soft404_score: page.soft404_score ?? null,
59
60
  noindex: page.noindex ?? 0,
60
61
  nofollow: page.nofollow ?? 0,
61
62
  security_error: page.security_error ?? null,
62
63
  retries: page.retries ?? 0,
63
64
  depth: page.depth ?? 0,
65
+ discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
64
66
  redirect_chain: page.redirect_chain ?? null,
65
67
  bytes_received: page.bytes_received ?? null,
68
+ is_internal: page.is_internal ?? 1,
66
69
  crawl_trap_flag: page.crawl_trap_flag ?? 0,
67
70
  crawl_trap_risk: page.crawl_trap_risk ?? null,
68
71
  trap_type: page.trap_type ?? null,
@@ -103,17 +106,18 @@ export class PageRepository {
103
106
  INSERT INTO pages (
104
107
  site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
105
108
  http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
106
- soft404_score, noindex, nofollow, security_error, retries, depth,
107
- redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
109
+ noindex, nofollow, security_error, retries, depth,
110
+ discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
108
111
  updated_at
109
112
  ) VALUES (
110
113
  @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
111
114
  @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
112
- @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
113
- @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
115
+ @noindex, @nofollow, @security_error, @retries, @depth,
116
+ @discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
114
117
  datetime('now')
115
118
  )
116
119
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
120
+ first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
117
121
  last_seen_snapshot_id = excluded.last_seen_snapshot_id,
118
122
  http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
119
123
  canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
@@ -122,14 +126,15 @@ export class PageRepository {
122
126
  etag = COALESCE(excluded.etag, pages.etag),
123
127
  last_modified = COALESCE(excluded.last_modified, pages.last_modified),
124
128
  html = COALESCE(excluded.html, pages.html),
125
- soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
126
129
  noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
127
130
  nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
128
131
  security_error = COALESCE(excluded.security_error, pages.security_error),
129
132
  retries = MAX(pages.retries, excluded.retries),
130
133
  depth = MIN(pages.depth, excluded.depth),
134
+ discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
131
135
  redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
132
136
  bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
137
+ is_internal = COALESCE(excluded.is_internal, pages.is_internal),
133
138
  crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
134
139
  crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
135
140
  trap_type = COALESCE(excluded.trap_type, pages.trap_type),
@@ -151,14 +156,15 @@ export class PageRepository {
151
156
  etag: page.etag ?? null,
152
157
  last_modified: page.last_modified ?? null,
153
158
  html: page.html ?? null,
154
- soft404_score: page.soft404_score ?? null,
155
159
  noindex: page.noindex ?? 0,
156
160
  nofollow: page.nofollow ?? 0,
157
161
  security_error: page.security_error ?? null,
158
162
  retries: page.retries ?? 0,
159
163
  depth: page.depth ?? 0,
164
+ discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
160
165
  redirect_chain: page.redirect_chain ?? null,
161
166
  bytes_received: page.bytes_received ?? null,
167
+ is_internal: page.is_internal ?? 1,
162
168
  crawl_trap_flag: page.crawl_trap_flag ?? 0,
163
169
  crawl_trap_risk: page.crawl_trap_risk ?? null,
164
170
  trap_type: page.trap_type ?? null,
@@ -170,17 +176,62 @@ export class PageRepository {
170
176
  tx(pages);
171
177
  return urlToId;
172
178
  }
173
- getPagesBySnapshot(snapshotId) {
174
- return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
179
+ getPagesBySnapshot(snapshotId, runType = 'completed') {
180
+ if (runType === 'single') {
181
+ return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').all(snapshotId);
182
+ }
183
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
175
184
  }
176
185
  getPagesIdentityBySnapshot(snapshotId) {
177
- return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').all(snapshotId, snapshotId);
186
+ // For identities, always loading all up to this point is fine for the crawler to map URLs to IDs.
187
+ return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
178
188
  }
179
- getPagesIteratorBySnapshot(snapshotId) {
180
- return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?').iterate(snapshotId, snapshotId);
189
+ getPagesIteratorBySnapshot(snapshotId, runType = 'completed') {
190
+ if (runType === 'single') {
191
+ return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').iterate(snapshotId);
192
+ }
193
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').iterate(snapshotId, snapshotId);
181
194
  }
182
195
  getIdByUrl(siteId, url) {
183
196
  const row = this.getIdStmt.get(siteId, url);
184
197
  return row?.id;
185
198
  }
199
+ reconcileInternalUrls(siteId, siteOrigin) {
200
+ const origin = siteOrigin.replace(/\/+$/, '');
201
+ const tx = this.db.transaction(() => {
202
+ const rows = this.db
203
+ .prepare("SELECT id, normalized_url FROM pages WHERE site_id = ? AND (normalized_url LIKE 'http://%' OR normalized_url LIKE 'https://%')")
204
+ .all(siteId);
205
+ for (const row of rows) {
206
+ let parsed;
207
+ try {
208
+ parsed = new URL(row.normalized_url);
209
+ }
210
+ catch {
211
+ continue;
212
+ }
213
+ if (parsed.origin !== origin) {
214
+ continue;
215
+ }
216
+ const targetPath = `${parsed.pathname}${parsed.search}`;
217
+ if (targetPath === row.normalized_url) {
218
+ continue;
219
+ }
220
+ const existing = this.db
221
+ .prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?')
222
+ .get(siteId, targetPath);
223
+ if (existing && existing.id !== row.id) {
224
+ this.db.prepare('UPDATE edges SET source_page_id = ? WHERE source_page_id = ?').run(existing.id, row.id);
225
+ this.db.prepare('UPDATE edges SET target_page_id = ? WHERE target_page_id = ?').run(existing.id, row.id);
226
+ this.db.prepare('UPDATE OR IGNORE metrics SET page_id = ? WHERE page_id = ?').run(existing.id, row.id);
227
+ this.db.prepare('DELETE FROM metrics WHERE page_id = ?').run(row.id);
228
+ this.db.prepare('DELETE FROM pages WHERE id = ?').run(row.id);
229
+ }
230
+ else {
231
+ this.db.prepare('UPDATE pages SET normalized_url = ? WHERE id = ?').run(targetPath, row.id);
232
+ }
233
+ }
234
+ });
235
+ tx();
236
+ }
186
237
  }