@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -2,6 +2,8 @@ import { Database } from 'better-sqlite3';
2
2
  export interface Site {
3
3
  id: number;
4
4
  domain: string;
5
+ preferred_url: string | null;
6
+ ssl: number | null;
5
7
  created_at: string;
6
8
  settings_json: string | null;
7
9
  is_active: number;
@@ -13,6 +15,10 @@ export declare class SiteRepository {
13
15
  getSite(domain: string): Site | undefined;
14
16
  getAllSites(): Site[];
15
17
  createSite(domain: string): number;
18
+ updateSitePreference(id: number, prefs: {
19
+ preferred_url: string;
20
+ ssl: number;
21
+ }): void;
16
22
  firstOrCreateSite(domain: string): Site;
17
23
  deleteSite(id: number): void;
18
24
  }
@@ -17,6 +17,10 @@ export class SiteRepository {
17
17
  const info = stmt.run(domain);
18
18
  return info.lastInsertRowid;
19
19
  }
20
+ updateSitePreference(id, prefs) {
21
+ const stmt = this.db.prepare('UPDATE sites SET preferred_url = ?, ssl = ? WHERE id = ?');
22
+ stmt.run(prefs.preferred_url, prefs.ssl, id);
23
+ }
20
24
  firstOrCreateSite(domain) {
21
25
  let site = this.getSite(domain);
22
26
  if (!site) {
@@ -2,11 +2,11 @@ import { Database } from 'better-sqlite3';
2
2
  export interface Snapshot {
3
3
  id: number;
4
4
  site_id: number;
5
- type: 'full' | 'partial' | 'incremental';
5
+ run_type: 'completed' | 'incremental' | 'single';
6
6
  created_at: string;
7
7
  node_count: number;
8
8
  edge_count: number;
9
- status: 'running' | 'completed' | 'failed';
9
+ status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
10
10
  limit_reached: number;
11
11
  health_score: number | null;
12
12
  orphan_count: number | null;
@@ -15,10 +15,17 @@ export interface Snapshot {
15
15
  export declare class SnapshotRepository {
16
16
  private db;
17
17
  constructor(db: Database);
18
- createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status?: 'running' | 'completed' | 'failed'): number;
19
- getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined;
18
+ createSnapshot(siteId: number, runType: Snapshot['run_type'], status?: Snapshot['status']): number;
19
+ getLatestSnapshot(siteId: number, status?: Snapshot['status'], includeSingle?: boolean): Snapshot | undefined;
20
+ touchSnapshot(id: number): void;
20
21
  getSnapshotCount(siteId: number): number;
21
- updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats?: Partial<Snapshot>): void;
22
+ /**
23
+ * Returns true if the site has ever had a completed full or incremental crawl.
24
+ * Single snapshots (from page --live) do NOT count as a "first crawl".
25
+ */
26
+ hasFullCrawl(siteId: number): boolean;
27
+ updateSnapshotStatus(id: number, status: Snapshot['status'], stats?: Partial<Snapshot>): void;
22
28
  getSnapshot(id: number): Snapshot | undefined;
23
29
  deleteSnapshot(id: number): void;
30
+ pruneSnapshots(siteId: number, maxSnapshots: number, maxSingleSnapshots: number, protectedSnapshotId?: number): void;
24
31
  }
@@ -3,18 +3,16 @@ export class SnapshotRepository {
3
3
  constructor(db) {
4
4
  this.db = db;
5
5
  }
6
- createSnapshot(siteId, type, status = 'running') {
7
- // Basic throttling or sleep if needed for tests, but generally SQLite is fast enough to have diff timestamps if not in same ms.
8
- // However, if we run in memory, created_at is default current time.
9
- // If two snapshots created in same second, ORDER BY created_at DESC is unstable or equal.
10
- // We should rely on ID for stability if timestamps are equal, but the query uses created_at.
11
- // Let's ensure we can also order by ID as tie-breaker.
12
- const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
13
- const info = stmt.run(siteId, type, status);
6
+ createSnapshot(siteId, runType, status = 'running') {
7
+ const stmt = this.db.prepare('INSERT INTO snapshots (site_id, run_type, status) VALUES (?, ?, ?)');
8
+ const info = stmt.run(siteId, runType, status);
14
9
  return info.lastInsertRowid;
15
10
  }
16
- getLatestSnapshot(siteId, status) {
17
- let sql = 'SELECT * FROM snapshots WHERE site_id = ? AND type != \'partial\'';
11
+ getLatestSnapshot(siteId, status, includeSingle = false) {
12
+ let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
13
+ if (!includeSingle) {
14
+ sql += ' AND run_type != \'single\'';
15
+ }
18
16
  const params = [siteId];
19
17
  if (status) {
20
18
  sql += ' AND status = ?';
@@ -23,10 +21,21 @@ export class SnapshotRepository {
23
21
  sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
24
22
  return this.db.prepare(sql).get(...params);
25
23
  }
24
+ touchSnapshot(id) {
25
+ this.db.prepare(`UPDATE snapshots SET created_at = datetime('now') WHERE id = ?`).run(id);
26
+ }
26
27
  getSnapshotCount(siteId) {
27
28
  const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
28
29
  return result.count;
29
30
  }
31
+ /**
32
+ * Returns true if the site has ever had a completed full or incremental crawl.
33
+ * Single snapshots (from page --live) do NOT count as a "first crawl".
34
+ */
35
+ hasFullCrawl(siteId) {
36
+ const result = this.db.prepare(`SELECT COUNT(*) as count FROM snapshots WHERE site_id = ? AND run_type IN ('completed', 'incremental') AND status = 'completed'`).get(siteId);
37
+ return result.count > 0;
38
+ }
30
39
  updateSnapshotStatus(id, status, stats = {}) {
31
40
  const sets = ['status = ?'];
32
41
  const params = [status];
@@ -73,4 +82,33 @@ export class SnapshotRepository {
73
82
  });
74
83
  tx();
75
84
  }
85
+ pruneSnapshots(siteId, maxSnapshots, maxSingleSnapshots, protectedSnapshotId) {
86
+ const tx = this.db.transaction(() => {
87
+ const singlesToDelete = this.db.prepare(`
88
+ SELECT id
89
+ FROM snapshots
90
+ WHERE site_id = ? AND run_type = 'single'
91
+ ORDER BY created_at DESC, id DESC
92
+ LIMIT -1 OFFSET ?
93
+ `).all(siteId, Math.max(0, maxSingleSnapshots));
94
+ const fullToDelete = this.db.prepare(`
95
+ SELECT id
96
+ FROM snapshots
97
+ WHERE site_id = ? AND run_type IN ('completed', 'incremental')
98
+ ORDER BY created_at DESC, id DESC
99
+ LIMIT -1 OFFSET ?
100
+ `).all(siteId, Math.max(0, maxSnapshots));
101
+ const ids = [...singlesToDelete, ...fullToDelete]
102
+ .map(r => r.id)
103
+ .filter(id => id !== protectedSnapshotId);
104
+ for (const id of ids) {
105
+ // Inline delete logic to keep operation inside this transaction.
106
+ this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
107
+ this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
108
+ this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
109
+ this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
110
+ }
111
+ });
112
+ tx();
113
+ }
76
114
  }
@@ -0,0 +1,9 @@
1
+ export interface ResetOptions {
2
+ reportsDir?: string;
3
+ dryRun?: boolean;
4
+ }
5
+ /**
6
+ * Completely resets the Crawlith state.
7
+ * Deletes the database, clears all locks, and optionally wipes the reports directory.
8
+ */
9
+ export declare function resetCrawlith(options?: ResetOptions): Promise<void>;
@@ -0,0 +1,32 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import os from 'node:os';
4
+ import { closeDb, getDb, getDbPath } from './index.js';
5
+ import { LockManager } from '../lock/lockManager.js';
6
+ /**
7
+ * Completely resets the Crawlith state.
8
+ * Deletes the database, clears all locks, and optionally wipes the reports directory.
9
+ */
10
+ export async function resetCrawlith(options = {}) {
11
+ const { reportsDir, dryRun = false } = options;
12
+ if (dryRun) {
13
+ return;
14
+ }
15
+ // 1. Close database connection to release file handles
16
+ closeDb();
17
+ // 2. Clear all locks
18
+ await LockManager.clearAllLocks();
19
+ // 3. Remove the entire state directory (includes DB)
20
+ const dbPath = getDbPath();
21
+ if (dbPath !== ':memory:') {
22
+ const crawlithDir = path.join(os.homedir(), '.crawlith');
23
+ await fs.rm(crawlithDir, { recursive: true, force: true });
24
+ }
25
+ // 4. Remove reports directory if specified
26
+ if (reportsDir) {
27
+ const resolvedReportsDir = path.resolve(reportsDir);
28
+ await fs.rm(resolvedReportsDir, { recursive: true, force: true });
29
+ }
30
+ // 5. Re-initialize database to ensure schema is fresh for next use
31
+ getDb();
32
+ }
@@ -0,0 +1,12 @@
1
+ import { Database, Statement } from 'better-sqlite3';
2
+ export declare class Statements {
3
+ private db;
4
+ getPageIdByUrl: Statement;
5
+ insertPluginReport: Statement;
6
+ getPluginReport: Statement;
7
+ deleteSnapshotPlugins: Statement;
8
+ getSnapshot: Statement;
9
+ getMigration: Statement;
10
+ insertMigration: Statement;
11
+ constructor(db: Database);
12
+ }
@@ -0,0 +1,40 @@
1
+ export class Statements {
2
+ db;
3
+ getPageIdByUrl;
4
+ insertPluginReport;
5
+ getPluginReport;
6
+ deleteSnapshotPlugins;
7
+ getSnapshot;
8
+ getMigration;
9
+ insertMigration;
10
+ constructor(db) {
11
+ this.db = db;
12
+ this.getPageIdByUrl = this.db.prepare(`
13
+ SELECT id FROM pages
14
+ WHERE site_id = (SELECT site_id FROM snapshots WHERE id = ?)
15
+ AND normalized_url = ?
16
+ `);
17
+ this.insertPluginReport = this.db.prepare(`
18
+ INSERT OR REPLACE INTO plugin_reports
19
+ (snapshot_id, plugin_name, data, total_score, score_count, score_weight_sum, score_calculated_at)
20
+ VALUES (?, ?, ?, ?, ?, ?, ?)
21
+ `);
22
+ this.getPluginReport = this.db.prepare(`
23
+ SELECT data FROM plugin_reports
24
+ WHERE snapshot_id = ? AND plugin_name = ?
25
+ ORDER BY created_at DESC LIMIT 1
26
+ `);
27
+ this.deleteSnapshotPlugins = this.db.prepare(`
28
+ DELETE FROM plugin_reports WHERE snapshot_id = ?
29
+ `);
30
+ this.getSnapshot = this.db.prepare(`
31
+ SELECT id FROM snapshots WHERE id = ?
32
+ `);
33
+ this.getMigration = this.db.prepare(`
34
+ SELECT plugin_name FROM plugin_migrations WHERE plugin_name = ?
35
+ `);
36
+ this.insertMigration = this.db.prepare(`
37
+ INSERT INTO plugin_migrations (plugin_name) VALUES (?)
38
+ `);
39
+ }
40
+ }
@@ -12,11 +12,6 @@ export interface DiffResult {
12
12
  oldCanonical: string | null;
13
13
  newCanonical: string | null;
14
14
  }[];
15
- changedDuplicateGroup: {
16
- url: string;
17
- oldGroup: string | null;
18
- newGroup: string | null;
19
- }[];
20
15
  metricDeltas: {
21
16
  structuralEntropy: number;
22
17
  orphanCount: number;
@@ -6,7 +6,6 @@ export function compareGraphs(oldGraph, newGraph) {
6
6
  const removedUrls = [];
7
7
  const changedStatus = [];
8
8
  const changedCanonical = [];
9
- const changedDuplicateGroup = [];
10
9
  // Added & Changed
11
10
  for (const [url, newNode] of newNodes) {
12
11
  const oldNode = oldNodes.get(url);
@@ -26,16 +25,6 @@ export function compareGraphs(oldGraph, newGraph) {
26
25
  newCanonical: newNode.canonical || null
27
26
  });
28
27
  }
29
- // Changed Duplicate Group
30
- const oldGroup = oldNode.duplicateClusterId || null;
31
- const newGroup = newNode.duplicateClusterId || null;
32
- if (oldGroup !== newGroup) {
33
- changedDuplicateGroup.push({
34
- url,
35
- oldGroup,
36
- newGroup
37
- });
38
- }
39
28
  }
40
29
  }
41
30
  // Removed
@@ -58,7 +47,6 @@ export function compareGraphs(oldGraph, newGraph) {
58
47
  removedUrls,
59
48
  changedStatus,
60
49
  changedCanonical,
61
- changedDuplicateGroup,
62
50
  metricDeltas
63
51
  };
64
52
  }
@@ -0,0 +1,16 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ export interface DiffOptions {
3
+ onlyCritical?: boolean;
4
+ }
5
+ export interface SnapshotDiff {
6
+ newPages: string[];
7
+ removedPages: string[];
8
+ changedPages: {
9
+ url: string;
10
+ changes: string[];
11
+ severity: 'low' | 'medium' | 'high';
12
+ }[];
13
+ }
14
+ export declare class DiffService {
15
+ compare(oldGraph: Graph | undefined, newGraph: Graph, _options?: DiffOptions): SnapshotDiff;
16
+ }
@@ -0,0 +1,41 @@
1
+ export class DiffService {
2
+ compare(oldGraph, newGraph, _options = {}) {
3
+ if (!oldGraph) {
4
+ return {
5
+ newPages: Array.from(newGraph.nodes.keys()),
6
+ removedPages: [],
7
+ changedPages: []
8
+ };
9
+ }
10
+ const oldUrls = new Set(oldGraph.nodes.keys());
11
+ const newUrls = new Set(newGraph.nodes.keys());
12
+ const newPages = Array.from(newUrls).filter(u => !oldUrls.has(u));
13
+ const removedPages = Array.from(oldUrls).filter(u => !newUrls.has(u));
14
+ const changedPages = [];
15
+ for (const url of newUrls) {
16
+ if (oldUrls.has(url)) {
17
+ const oldNode = oldGraph.nodes.get(url);
18
+ const newNode = newGraph.nodes.get(url);
19
+ const changes = [];
20
+ let severity = 'low';
21
+ if (oldNode.status !== newNode.status) {
22
+ changes.push(`status: ${oldNode.status} -> ${newNode.status}`);
23
+ severity = 'high';
24
+ }
25
+ if (oldNode.contentHash !== newNode.contentHash) {
26
+ changes.push('content changed');
27
+ if (severity !== 'high')
28
+ severity = 'medium';
29
+ }
30
+ if (oldNode.noindex !== newNode.noindex) {
31
+ changes.push(`noindex: ${oldNode.noindex} -> ${newNode.noindex}`);
32
+ severity = 'high';
33
+ }
34
+ if (changes.length > 0) {
35
+ changedPages.push({ url, changes, severity });
36
+ }
37
+ }
38
+ }
39
+ return { newPages, removedPages, changedPages };
40
+ }
41
+ }
@@ -0,0 +1,4 @@
1
+ export * from '../graph/graph.js';
2
+ export * from '../graph/metrics.js';
3
+ export * from '../graph/simhash.js';
4
+ export * from '../crawler/normalize.js';
@@ -0,0 +1,4 @@
1
+ export * from '../graph/graph.js';
2
+ export * from '../graph/metrics.js';
3
+ export * from '../graph/simhash.js';
4
+ export * from '../crawler/normalize.js';
package/dist/events.d.ts CHANGED
@@ -15,6 +15,14 @@ export type CrawlEvent = {
15
15
  } | {
16
16
  type: 'crawl:limit-reached';
17
17
  limit: number;
18
+ } | {
19
+ type: 'crawl:progress';
20
+ pagesCrawled: number;
21
+ queued: number;
22
+ active: number;
23
+ nodesFound: number;
24
+ edgesFound: number;
25
+ phase?: string;
18
26
  } | {
19
27
  type: 'queue:enqueue';
20
28
  url: string;
@@ -1,5 +1,6 @@
1
1
  export interface GraphNode {
2
2
  url: string;
3
+ isInternal?: boolean;
3
4
  depth: number;
4
5
  inLinks: number;
5
6
  outLinks: number;
@@ -9,50 +10,48 @@ export interface GraphNode {
9
10
  nofollow?: boolean;
10
11
  brokenLinks?: string[];
11
12
  redirectChain?: string[];
13
+ discoveredViaSitemap?: boolean;
12
14
  incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
13
15
  etag?: string;
14
16
  lastModified?: string;
15
17
  contentHash?: string;
16
18
  html?: string;
17
- pageRank?: number;
18
- pageRankScore?: number;
19
- authorityScore?: number;
20
- hubScore?: number;
21
- duplicateClusterId?: string;
22
- duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
23
- isClusterPrimary?: boolean;
24
- isCollapsed?: boolean;
25
- collapseInto?: string;
26
19
  simhash?: string;
27
20
  uniqueTokenRatio?: number;
28
- soft404Score?: number;
29
- soft404Signals?: string[];
30
21
  crawlTrapFlag?: boolean;
31
22
  crawlTrapRisk?: number;
32
23
  trapType?: string;
33
24
  securityError?: string;
34
25
  retries?: number;
35
- clusterId?: number;
36
26
  bytesReceived?: number;
37
- linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
38
27
  crawlStatus?: string;
39
28
  wordCount?: number;
40
29
  thinContentScore?: number;
41
30
  externalLinkRatio?: number;
31
+ h1Count?: number;
32
+ h2Count?: number;
33
+ title?: string;
34
+ clusterId?: number;
35
+ duplicateClusterId?: string;
36
+ duplicateType?: 'exact' | 'near' | 'template_heavy';
37
+ pagerankScore?: number;
38
+ hubScore?: number;
39
+ authScore?: number;
40
+ linkRole?: string;
41
+ soft404Score?: number;
42
+ headingScore?: number;
42
43
  orphanScore?: number;
44
+ orphanType?: string;
45
+ impactLevel?: string;
46
+ headingData?: string;
47
+ isClusterPrimary?: boolean;
48
+ isCollapsed?: boolean;
43
49
  }
44
50
  export interface GraphEdge {
45
51
  source: string;
46
52
  target: string;
47
53
  weight: number;
48
54
  }
49
- export interface ClusterInfo {
50
- id: number;
51
- count: number;
52
- primaryUrl: string;
53
- risk: 'low' | 'medium' | 'high';
54
- sharedPathPrefix?: string;
55
- }
56
55
  export interface CrawlStats {
57
56
  pagesFetched: number;
58
57
  pagesCached: number;
@@ -64,19 +63,6 @@ export declare class Graph {
64
63
  edges: Map<string, number>;
65
64
  limitReached: boolean;
66
65
  sessionStats: CrawlStats;
67
- trapClusters: {
68
- pattern: string;
69
- type: string;
70
- count: number;
71
- }[];
72
- duplicateClusters: {
73
- id: string;
74
- type: 'exact' | 'near' | 'template_heavy';
75
- size: number;
76
- representative: string;
77
- severity: 'low' | 'medium' | 'high';
78
- }[];
79
- contentClusters: ClusterInfo[];
80
66
  /**
81
67
  * Generates a unique key for an edge.
82
68
  */
@@ -93,7 +79,7 @@ export declare class Graph {
93
79
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
94
80
  * Depth is only set on creation (BFS guarantees shortest path first).
95
81
  */
96
- addNode(url: string, depth: number, status?: number): void;
82
+ addNode(url: string, depth: number, status?: number, isInternal?: boolean): void;
97
83
  updateNodeData(url: string, data: Partial<GraphNode>): void;
98
84
  /**
99
85
  * Adds a directed edge between two nodes.
@@ -106,14 +92,6 @@ export declare class Graph {
106
92
  toJSON(): {
107
93
  nodes: GraphNode[];
108
94
  edges: GraphEdge[];
109
- duplicateClusters: {
110
- id: string;
111
- type: "exact" | "near" | "template_heavy";
112
- size: number;
113
- representative: string;
114
- severity: "low" | "medium" | "high";
115
- }[];
116
- contentClusters: ClusterInfo[];
117
95
  };
118
96
  static fromJSON(json: any): Graph;
119
97
  }
@@ -9,32 +9,33 @@ export class Graph {
9
9
  pagesSkipped: 0,
10
10
  totalFound: 0
11
11
  };
12
- trapClusters = [];
13
- duplicateClusters = [];
14
- contentClusters = [];
15
12
  /**
16
13
  * Generates a unique key for an edge.
17
14
  */
18
15
  static getEdgeKey(source, target) {
19
- return JSON.stringify([source, target]);
16
+ return source + '\x00' + target;
20
17
  }
21
18
  /**
22
19
  * Parses an edge key back into source and target.
23
20
  */
24
21
  static parseEdgeKey(key) {
25
- const [source, target] = JSON.parse(key);
26
- return { source, target };
22
+ const splitIndex = key.indexOf('\x00');
23
+ return {
24
+ source: key.slice(0, splitIndex),
25
+ target: key.slice(splitIndex + 1)
26
+ };
27
27
  }
28
28
  /**
29
29
  * Adds a node to the graph if it doesn't exist.
30
30
  * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
31
31
  * Depth is only set on creation (BFS guarantees shortest path first).
32
32
  */
33
- addNode(url, depth, status = 0) {
33
+ addNode(url, depth, status = 0, isInternal = true) {
34
34
  const existing = this.nodes.get(url);
35
35
  if (!existing) {
36
36
  this.nodes.set(url, {
37
37
  url,
38
+ isInternal,
38
39
  depth,
39
40
  status,
40
41
  inLinks: 0,
@@ -46,6 +47,9 @@ export class Graph {
46
47
  if (status !== 0) {
47
48
  existing.status = status;
48
49
  }
50
+ if (isInternal !== undefined) {
51
+ existing.isInternal = isInternal;
52
+ }
49
53
  }
50
54
  }
51
55
  updateNodeData(url, data) {
@@ -90,9 +94,7 @@ export class Graph {
90
94
  toJSON() {
91
95
  return {
92
96
  nodes: this.getNodes(),
93
- edges: this.getEdges(),
94
- duplicateClusters: this.duplicateClusters,
95
- contentClusters: this.contentClusters
97
+ edges: this.getEdges()
96
98
  };
97
99
  }
98
100
  static fromJSON(json) {
@@ -108,12 +110,6 @@ export class Graph {
108
110
  graph.edges.set(key, edge.weight || 1.0);
109
111
  }
110
112
  }
111
- if (json.duplicateClusters) {
112
- graph.duplicateClusters = json.duplicateClusters;
113
- }
114
- if (json.contentClusters) {
115
- graph.contentClusters = json.contentClusters;
116
- }
117
113
  return graph;
118
114
  }
119
115
  }
@@ -0,0 +1,23 @@
1
+ import { Graph } from './graph.js';
2
+ export type LinkRole = 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
3
+ export interface HITSRow {
4
+ authority_score: number;
5
+ hub_score: number;
6
+ link_role: LinkRole;
7
+ }
8
+ export interface HITSOptions {
9
+ iterations?: number;
10
+ }
11
+ /**
12
+ * Service to compute Hub and Authority scores using the HITS algorithm.
13
+ * Operates purely on the internal link graph.
14
+ */
15
+ export declare class HITSService {
16
+ /**
17
+ * Computes Hub and Authority scores using the HITS algorithm.
18
+ * @param {Graph} graph - The link graph to analyze.
19
+ * @param {HITSOptions} options - Algorithm options (e.g. number of iterations).
20
+ * @returns {Map<string, HITSRow>} A map of page URLs to their HITS results.
21
+ */
22
+ evaluate(graph: Graph, options?: HITSOptions): Map<string, HITSRow>;
23
+ }