@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,73 @@
1
+ import { request } from 'undici';
2
+ import * as cheerio from 'cheerio';
3
+ import { normalizeUrl } from './normalize.js';
4
+
5
export class Sitemap {
  /**
   * Downloads a sitemap (or sitemap index) and returns the page URLs it lists.
   *
   * Sitemap indexes are followed recursively. A shared "visited" set prevents
   * fetching the same sitemap twice and caps how many sitemap documents are
   * considered in a single call.
   *
   * @param url Location of the sitemap or sitemap index.
   * @returns De-duplicated, normalized page URLs.
   */
  async fetch(url: string): Promise<string[]> {
    const seenSitemaps = new Set<string>();
    const collected = new Set<string>();

    await this.processSitemap(url, seenSitemaps, collected);

    return [...collected];
  }

  /**
   * Fetches one sitemap document and either recurses into its children
   * (sitemap index) or records its page URLs (URL set). Network and parse
   * failures are logged and otherwise ignored.
   */
  private async processSitemap(url: string, visited: Set<string>, urls: Set<string>) {
    if (visited.has(url)) return;
    visited.add(url);

    // Stop once more than 50 distinct sitemap URLs have been registered.
    if (visited.size > 50) return;

    try {
      const res = await request(url, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 10000,
        bodyTimeout: 10000
      });

      const succeeded = res.statusCode >= 200 && res.statusCode < 300;
      if (!succeeded) {
        // Drain the body so the connection can be released.
        await res.body.dump();
        return;
      }

      const xml = await res.body.text();
      // Cheap sanity check: anything that is not markup is skipped.
      if (!xml.trim().startsWith('<')) return;

      const $ = cheerio.load(xml, { xmlMode: true });

      // A document containing <sitemap><loc> entries is a sitemap index.
      const indexEntries = $('sitemap > loc');
      if (indexEntries.length > 0) {
        const childSitemaps = indexEntries
          .toArray()
          .map((el) => $(el).text().trim())
          .filter((loc) => loc.length > 0);

        // Children are fetched one at a time to avoid a burst of requests.
        for (const childUrl of childSitemaps) {
          await this.processSitemap(childUrl, visited, urls);
        }
        return;
      }

      // Otherwise treat the document as a URL set.
      for (const el of $('url > loc').toArray()) {
        const loc = $(el).text().trim();
        if (!loc) continue;

        const normalized = normalizeUrl(loc, '');
        if (normalized) {
          urls.add(normalized);
        }
      }
    } catch (e) {
      console.warn(`Failed to fetch sitemap ${url}:`, e);
    }
  }
}
@@ -0,0 +1,96 @@
1
+
2
/** Category of crawl trap a URL was flagged as. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';

/** Outcome of a trap check: a risk in [0, 1] and the matching trap category, if any. */
export interface TrapResult {
  risk: number;
  type: TrapType | null;
}

/**
 * Stateful heuristic detector for crawl traps (session IDs, calendar paths,
 * runaway pagination and query-parameter explosion).
 *
 * The detector remembers the query-string variants and highest page number
 * seen per `origin + pathname`, so one instance should be used per crawl.
 */
export class TrapDetector {
  /** Date-like path segments: /YYYY-MM-DD or /DD-MM-YYYY ('-' or '/' separated), followed by '/' or end of path. */
  private static readonly CALENDAR_PATH_PATTERN =
    /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;

  private pathCounters = new Map<string, Set<string>>();
  private paginationCounters = new Map<string, number>();
  private sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);

  // Configurable thresholds
  private PARAM_EXPLOSION_THRESHOLD = 30;
  private PAGINATION_THRESHOLD = 50;

  /**
   * @param options.paramThreshold Distinct query-string variants per path tolerated before
   *   the path is flagged as faceted navigation. Omitted or 0 keeps the default (30).
   * @param options.paginationThreshold Highest page number tolerated before a URL is flagged
   *   as a pagination loop. Omitted or 0 keeps the default (50).
   */
  constructor(options: { paramThreshold?: number, paginationThreshold?: number } = {}) {
    if (options.paramThreshold) this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold;
    if (options.paginationThreshold) this.PAGINATION_THRESHOLD = options.paginationThreshold;
  }

  /**
   * Checks if a URL represents a potential crawl trap.
   *
   * The reported `type` belongs to the detector that produced the reported `risk`.
   * The one exception is parameter explosion: it raises the risk but only sets the
   * type when no more specific detector matched.
   *
   * @param rawUrl Absolute URL to inspect. Unparseable input yields `{ risk: 0, type: null }`.
   * @param _depth Crawl depth of the URL (currently unused).
   */
  checkTrap(rawUrl: string, _depth: number): TrapResult {
    let risk = 0;
    let type: TrapType | null = null;

    try {
      const u = new URL(rawUrl);
      const params = new URLSearchParams(u.search);
      const pathname = u.pathname;
      const pathKey = `${u.origin}${pathname}`;

      // 1. Session IDs / Tracking Parameters
      for (const key of params.keys()) {
        const lowered = key.toLowerCase();
        if (this.sessionParams.has(lowered) || lowered.includes('session')) {
          risk = 0.9;
          type = 'session_trap';
          break;
        }
      }

      // 2. Calendar Pattern Detection
      // Only takes over the type when it actually raises the risk.
      if (risk < 0.7 && TrapDetector.CALENDAR_PATH_PATTERN.test(pathname)) {
        risk = 0.7;
        type = 'calendar_trap';
      }

      // 3. Pagination Loop
      const pageParam = params.get('page') || params.get('p') || params.get('pg');
      if (pageParam && /^\d+$/.test(pageParam)) {
        const pageNum = parseInt(pageParam, 10);
        const currentMaxPage = this.paginationCounters.get(pathKey) || 0;

        if (pageNum > currentMaxPage) {
          this.paginationCounters.set(pathKey, pageNum);
        }

        if (pageNum > this.PAGINATION_THRESHOLD && risk < 0.85) {
          risk = 0.85;
          type = 'pagination_loop';
        }
      }

      // 4. Infinite Parameter Explosion (Faceted Navigation)
      // Sorting makes ?a=1&b=2 and ?b=2&a=1 count as the same variant. A non-empty
      // serialization means at least one parameter is present; this avoids relying
      // on URLSearchParams.size, which older runtimes do not provide.
      params.sort();
      const paramKey = params.toString();
      if (paramKey !== '') {
        const paramSet = this.pathCounters.get(pathKey) || new Set<string>();
        paramSet.add(paramKey);
        this.pathCounters.set(pathKey, paramSet);

        if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
          risk = Math.max(risk, 0.95);
          if (!type) type = 'faceted_navigation';
        }
      }

    } catch (_e) {
      // Invalid URL: report no risk.
    }

    return { risk, type };
  }

  /**
   * Resets internal state (useful for multi-crawl sessions if needed)
   */
  reset() {
    this.pathCounters.clear();
    this.paginationCounters.clear();
  }
}
@@ -0,0 +1,105 @@
1
+ import { getDb } from './index.js';
2
+ import { PageRepository } from './repositories/PageRepository.js';
3
+ import { EdgeRepository } from './repositories/EdgeRepository.js';
4
+ import { MetricsRepository, DbMetrics } from './repositories/MetricsRepository.js';
5
+ import { SnapshotRepository } from './repositories/SnapshotRepository.js';
6
+ import { Graph } from '../graph/graph.js';
7
+
8
/**
 * Rebuilds an in-memory Graph from the rows stored for a snapshot.
 *
 * Nodes come from `PageRepository.getPagesBySnapshot`, per-page metrics from
 * `MetricsRepository.getMetrics`, edges from `EdgeRepository.getEdgesBySnapshot`,
 * and cluster summaries from the `duplicate_clusters` / `content_clusters` tables.
 * Edges whose endpoints are not among the loaded pages are skipped.
 *
 * @param snapshotId Identifier of the snapshot to load.
 * @returns The populated graph (empty if the snapshot has no pages).
 */
export function loadGraphFromSnapshot(snapshotId: number): Graph {
  const db = getDb();
  const pageRepo = new PageRepository(db);
  const edgeRepo = new EdgeRepository(db);
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);

  const pages = pageRepo.getPagesBySnapshot(snapshotId);
  const metrics = metricsRepo.getMetrics(snapshotId);
  const snapshot = snapshotRepo.getSnapshot(snapshotId);

  // Index metrics by page id for O(1) lookup while building nodes.
  const metricsMap = new Map<number, DbMetrics>();
  for (const m of metrics) {
    metricsMap.set(m.page_id, m);
  }

  // A single corrupt redirect_chain value must not abort the whole load:
  // log it and treat the chain as absent. The parsed shape is not validated here.
  const parseRedirectChain = (raw: string | null, pageUrl: string) => {
    if (!raw) return undefined;
    try {
      return JSON.parse(raw);
    } catch (err) {
      console.warn(`Ignoring malformed redirect_chain for ${pageUrl}:`, err);
      return undefined;
    }
  };

  const graph = new Graph();
  if (snapshot) {
    graph.limitReached = !!snapshot.limit_reached;
  }

  // Maps database page ids to node keys so edges can be resolved below.
  const idMap = new Map<number, string>();

  for (const p of pages) {
    idMap.set(p.id, p.normalized_url);
    graph.addNode(p.normalized_url, p.depth, p.http_status || 0);

    const m = metricsMap.get(p.id);

    // Pages first seen in this snapshot are "new"; otherwise the recorded
    // crawl status decides between "unchanged" (cached) and "changed" (fetched).
    let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
    if (p.first_seen_snapshot_id === snapshotId) {
      incrementalStatus = 'new';
    } else if (m?.crawl_status === 'cached') {
      incrementalStatus = 'unchanged';
    } else if (m?.crawl_status === 'fetched') {
      incrementalStatus = 'changed';
    }

    graph.updateNodeData(p.normalized_url, {
      canonical: p.canonical_url || undefined,
      contentHash: p.content_hash || undefined,
      simhash: p.simhash || undefined,
      etag: p.etag || undefined,
      lastModified: p.last_modified || undefined,
      html: p.html || undefined,
      soft404Score: p.soft404_score || undefined,
      noindex: !!p.noindex,
      nofollow: !!p.nofollow,
      incrementalStatus,
      securityError: p.security_error || undefined,
      retries: p.retries || undefined,
      bytesReceived: p.bytes_received || undefined,
      redirectChain: parseRedirectChain(p.redirect_chain, p.normalized_url),
      crawlTrapFlag: !!p.crawl_trap_flag,
      crawlTrapRisk: p.crawl_trap_risk || undefined,
      trapType: p.trap_type || undefined,
      // Metrics
      pageRank: m?.pagerank ?? undefined,
      pageRankScore: m?.pagerank_score ?? m?.pagerank ?? undefined,
      authorityScore: m?.authority_score ?? undefined,
      hubScore: m?.hub_score ?? undefined,
      linkRole: m?.link_role ?? undefined,
      // Duplicate info
      duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
      duplicateType: m?.duplicate_type ?? undefined,
      isClusterPrimary: m?.is_cluster_primary ? true : undefined,
    });
  }

  const edges = edgeRepo.getEdgesBySnapshot(snapshotId);

  for (const e of edges) {
    const source = idMap.get(e.source_page_id);
    const target = idMap.get(e.target_page_id);
    if (source && target) {
      // A missing or zero weight falls back to 1.0.
      graph.addEdge(source, target, e.weight || 1.0);
    }
  }

  // Load duplicate clusters
  const dupClusters = db.prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.duplicateClusters = dupClusters.map(c => ({
    id: c.id,
    type: c.type,
    size: c.size,
    representative: c.representative,
    severity: c.severity
  }));

  // Load content clusters
  const contentClusters = db.prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.contentClusters = contentClusters.map(c => ({
    id: c.id,
    count: c.count,
    primaryUrl: c.primary_url,
    risk: c.risk,
    sharedPathPrefix: c.shared_path_prefix || undefined
  }));

  return graph;
}
@@ -0,0 +1,70 @@
1
+ import Database from 'better-sqlite3';
2
+ import path from 'node:path';
3
+ import fs from 'node:fs';
4
+ import os from 'node:os';
5
+ import { initSchema } from './schema.js';
6
+
7
/** Cached connection shared by the whole process; null until getDb() opens it and again after closeDb(). */
let dbInstance: Database.Database | null = null;
8
+
9
/**
 * Resolves where the SQLite database lives.
 *
 * Resolution order:
 * 1. `NODE_ENV === 'test'`: an in-memory database (`:memory:`).
 * 2. `CRAWLITH_DB_PATH`, when set to a non-empty value.
 * 3. `~/.crawlith/crawlith.db`; the directory is created with mode 700 if missing.
 *
 * @returns The database path, or `:memory:` under test.
 */
export function getDbPath(): string {
  const { NODE_ENV, CRAWLITH_DB_PATH } = process.env;

  if (NODE_ENV === 'test') return ':memory:';
  if (CRAWLITH_DB_PATH) return CRAWLITH_DB_PATH;

  const dataDir = path.join(os.homedir(), '.crawlith');
  const dirExists = fs.existsSync(dataDir);
  if (!dirExists) {
    fs.mkdirSync(dataDir, { recursive: true });
    // Restrict the freshly created directory to the owning user.
    fs.chmodSync(dataDir, 0o700);
  }

  return path.join(dataDir, 'crawlith.db');
}
25
+
26
/**
 * Returns the process-wide database connection, opening and configuring it on first use.
 *
 * On first call this opens the database at `getDbPath()`, applies the pragmas below,
 * tightens the file permissions, runs an integrity check (a failure is logged, not
 * thrown) and initializes the schema. If any setup step throws, the half-configured
 * connection is closed before the error is rethrown, so nothing is cached or leaked.
 *
 * @returns The shared connection.
 * @throws Whatever the driver or `initSchema` throws during setup.
 */
export function getDb(): Database.Database {
  if (dbInstance) {
    return dbInstance;
  }

  const dbPath = getDbPath();
  const db = new Database(dbPath);

  try {
    // Hardening & Performance Configuration
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');
    db.pragma('foreign_keys = ON');
    db.pragma('temp_store = MEMORY');
    db.pragma('mmap_size = 30000000000');
    db.pragma('cache_size = -20000');
    db.pragma('busy_timeout = 5000');

    // Security controls
    // Ensure file permissions are 600 (user read/write only). In-memory and
    // anonymous temporary databases have no file to restrict.
    const isFileBacked = dbPath !== ':memory:' && dbPath !== '';
    if (isFileBacked) {
      try {
        fs.chmodSync(dbPath, 0o600);
      } catch (err) {
        // Best effort: keep going, but make the missed hardening visible.
        console.warn(`Could not restrict permissions on ${dbPath}:`, err);
      }
    }

    // Integrity check on startup
    const integrity = db.pragma('integrity_check', { simple: true });
    if (integrity !== 'ok') {
      console.warn('Database integrity check failed:', integrity);
    }

    // Initialize schema
    initSchema(db);
  } catch (err) {
    // Do not leak the open handle when setup fails part-way through.
    db.close();
    throw err;
  }

  dbInstance = db;
  return db;
}
64
+
65
/**
 * Closes the shared connection, if one is open, and clears the cache so the
 * next getDb() call opens a fresh connection. Does nothing when no connection is open.
 */
export function closeDb() {
  if (!dbInstance) return;

  dbInstance.close();
  dbInstance = null;
}
@@ -0,0 +1,29 @@
1
+ import { Database } from 'better-sqlite3';
2
+
3
/** Row shape of the `edges` table: one directed link between two pages within a snapshot. */
export interface Edge {
  /** Row identifier. */
  id: number;
  /** Snapshot the link was recorded in. */
  snapshot_id: number;
  /** Id of the page the link originates from. */
  source_page_id: number;
  /** Id of the page the link points to. */
  target_page_id: number;
  /** Link weight. */
  weight: number;
  /** Link relation / classification. */
  rel: 'nofollow' | 'sponsored' | 'ugc' | 'internal' | 'external' | 'unknown';
}
11
+
12
/** Data access for the `edges` table. */
export class EdgeRepository {
  /** Insert statement prepared once and reused for every insertEdge call. */
  private insertStmt;

  constructor(private db: Database) {
    const insertSql = `
      INSERT INTO edges (snapshot_id, source_page_id, target_page_id, weight, rel)
      VALUES (?, ?, ?, ?, ?)
    `;
    this.insertStmt = this.db.prepare(insertSql);
  }

  /**
   * Stores one link between two pages of a snapshot.
   *
   * @param weight Link weight; defaults to 1.0.
   * @param rel Link relation; defaults to 'internal'.
   */
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight: number = 1.0, rel: string = 'internal') {
    this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
  }

  /** Returns every edge recorded for the given snapshot. */
  getEdgesBySnapshot(snapshotId: number): Edge[] {
    const query = this.db.prepare('SELECT * FROM edges WHERE snapshot_id = ?');
    const rows = query.all(snapshotId);
    return rows as Edge[];
  }
}
@@ -0,0 +1,49 @@
1
+ import { Database } from 'better-sqlite3';
2
+
3
/** Row shape of the `metrics` table: computed scores for one page within one snapshot. */
export interface DbMetrics {
  /** Snapshot the metrics were computed for. */
  snapshot_id: number;
  /** Page the metrics describe. */
  page_id: number;

  // Link-analysis scores
  authority_score: number | null;
  hub_score: number | null;
  pagerank: number | null;
  pagerank_score: number | null;
  link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;

  /** How the page was obtained during the crawl (free-form string). */
  crawl_status: string | null;

  // Content signals
  word_count: number | null;
  thin_content_score: number | null;
  external_link_ratio: number | null;
  orphan_score: number | null;

  // Duplicate detection
  duplicate_cluster_id: string | null;
  duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
  /** Numeric flag; a truthy value marks the page as the primary of its duplicate cluster. */
  is_cluster_primary: number;
}
20
+
21
/** Data access for the `metrics` table. */
export class MetricsRepository {
  /** Insert-or-replace statement prepared once and reused for every insertMetrics call. */
  private insertStmt;

  constructor(private db: Database) {
    const insertSql = `
      INSERT OR REPLACE INTO metrics (
        snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
        link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
        orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
      ) VALUES (
        @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
        @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
        @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
      )
    `;
    this.insertStmt = this.db.prepare(insertSql);
  }

  /** Writes one metrics row; the statement replaces a conflicting existing row. */
  insertMetrics(metrics: DbMetrics) {
    this.insertStmt.run(metrics);
  }

  /** Returns all metrics rows belonging to a snapshot. */
  getMetrics(snapshotId: number): DbMetrics[] {
    const query = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?');
    return query.all(snapshotId) as DbMetrics[];
  }

  /** Returns the metrics row of one page in a snapshot, or undefined when none exists. */
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
    const query = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
    return query.get(snapshotId, pageId) as DbMetrics | undefined;
  }
}
@@ -0,0 +1,128 @@
1
+ import { Database } from 'better-sqlite3';
2
+
3
/** Row shape of the `pages` table: the stored state of one URL of a site. */
export interface Page {
  // Identity
  id: number;
  site_id: number;
  normalized_url: string;

  // Snapshot bookkeeping
  first_seen_snapshot_id: number | null;
  last_seen_snapshot_id: number | null;

  // Fetch result
  http_status: number | null;
  canonical_url: string | null;
  content_hash: string | null;
  simhash: string | null;
  etag: string | null;
  last_modified: string | null;
  html: string | null;
  soft404_score: number | null;

  // Robots directives, stored as numeric flags
  noindex: number;
  nofollow: number;

  // Crawl diagnostics
  security_error: string | null;
  retries: number;
  depth: number;
  /** Serialized redirect chain (a string; presumably JSON, confirm against the writer). */
  redirect_chain: string | null;
  bytes_received: number | null;

  // Crawl-trap detection
  /** Numeric flag. */
  crawl_trap_flag: number;
  crawl_trap_risk: number | null;
  trap_type: string | null;

  // Row timestamps
  created_at: string;
  updated_at: string;
}
30
+
31
/** Data access for the `pages` table. */
export class PageRepository {
  /** Insert-or-update statement keyed on (site_id, normalized_url). */
  private upsertStmt;
  /** Looks up a page id by (site_id, normalized_url). */
  private getIdStmt;

  constructor(private db: Database) {
    // On conflict every column except the key and first_seen_snapshot_id is refreshed.
    const upsertSql = `
      INSERT INTO pages (
        site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
        http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
        soft404_score, noindex, nofollow, security_error, retries, depth,
        redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
        updated_at
      ) VALUES (
        @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
        @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
        @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
        @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
        datetime('now')
      )
      ON CONFLICT(site_id, normalized_url) DO UPDATE SET
        last_seen_snapshot_id = excluded.last_seen_snapshot_id,
        http_status = excluded.http_status,
        canonical_url = excluded.canonical_url,
        content_hash = excluded.content_hash,
        simhash = excluded.simhash,
        etag = excluded.etag,
        last_modified = excluded.last_modified,
        html = excluded.html,
        soft404_score = excluded.soft404_score,
        noindex = excluded.noindex,
        nofollow = excluded.nofollow,
        security_error = excluded.security_error,
        retries = excluded.retries,
        depth = excluded.depth,
        redirect_chain = excluded.redirect_chain,
        bytes_received = excluded.bytes_received,
        crawl_trap_flag = excluded.crawl_trap_flag,
        crawl_trap_risk = excluded.crawl_trap_risk,
        trap_type = excluded.trap_type,
        updated_at = datetime('now')
    `;
    const getIdSql = 'SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?';

    this.upsertStmt = this.db.prepare(upsertSql);
    this.getIdStmt = this.db.prepare(getIdSql);
  }

  /**
   * Inserts a page or refreshes the existing row for the same site and URL.
   *
   * Omitted nullable fields are stored as NULL and omitted counters/flags as 0.
   * `first_seen_snapshot_id` falls back to `last_seen_snapshot_id` and is only
   * written when the row is first inserted.
   *
   * @returns The statement's run result.
   */
  upsertPage(page: Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number }) {
    const firstSeen = page.first_seen_snapshot_id ?? page.last_seen_snapshot_id;

    return this.upsertStmt.run({
      // Key and snapshot bookkeeping
      site_id: page.site_id,
      normalized_url: page.normalized_url,
      first_seen_snapshot_id: firstSeen,
      last_seen_snapshot_id: page.last_seen_snapshot_id,
      // Nullable fetch data
      http_status: page.http_status ?? null,
      canonical_url: page.canonical_url ?? null,
      content_hash: page.content_hash ?? null,
      simhash: page.simhash ?? null,
      etag: page.etag ?? null,
      last_modified: page.last_modified ?? null,
      html: page.html ?? null,
      soft404_score: page.soft404_score ?? null,
      // Flags and counters default to 0
      noindex: page.noindex ?? 0,
      nofollow: page.nofollow ?? 0,
      security_error: page.security_error ?? null,
      retries: page.retries ?? 0,
      depth: page.depth ?? 0,
      redirect_chain: page.redirect_chain ?? null,
      bytes_received: page.bytes_received ?? null,
      crawl_trap_flag: page.crawl_trap_flag ?? 0,
      crawl_trap_risk: page.crawl_trap_risk ?? null,
      trap_type: page.trap_type ?? null,
    });
  }

  /**
   * Upserts a page and returns its id, both inside one transaction.
   *
   * @throws Error when the row cannot be read back after the upsert.
   */
  upsertAndGetId(page: Partial<Page> & { site_id: number; normalized_url: string; last_seen_snapshot_id: number }): number {
    const upsertThenLookup = this.db.transaction(() => {
      this.upsertPage(page);
      const found = this.getIdStmt.get(page.site_id, page.normalized_url) as { id: number } | undefined;
      if (found === undefined) {
        throw new Error(`Failed to retrieve ID for upserted page: ${page.normalized_url}`);
      }
      return found.id;
    });
    return upsertThenLookup();
  }

  /** Returns the stored page for a site and normalized URL, or undefined when absent. */
  getPage(siteId: number, url: string): Page | undefined {
    const query = this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?');
    return query.get(siteId, url) as Page | undefined;
  }

  /** Returns the pages whose last_seen_snapshot_id equals the given snapshot. */
  getPagesBySnapshot(snapshotId: number): Page[] {
    const query = this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?');
    return query.all(snapshotId) as Page[];
  }

  /** Returns the id of the page for a site and normalized URL, or undefined when absent. */
  getIdByUrl(siteId: number, url: string): number | undefined {
    const found = this.getIdStmt.get(siteId, url) as { id: number } | undefined;
    return found?.id;
  }
}
@@ -0,0 +1,32 @@
1
+ import { Database } from 'better-sqlite3';
2
+
3
/** Row shape of the `sites` table. */
export interface Site {
  /** Row identifier. */
  id: number;
  /** Domain the site is registered under; used as the lookup key. */
  domain: string;
  /** Creation timestamp as stored by the database. */
  created_at: string;
  /** Serialized per-site settings, if any. */
  settings_json: string | null;
  /** Numeric flag. */
  is_active: number;
}
10
+
11
/** Data access for the `sites` table. */
export class SiteRepository {
  constructor(private db: Database) { }

  /** Returns the site registered for a domain, or undefined when there is none. */
  getSite(domain: string): Site | undefined {
    return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain) as Site | undefined;
  }

  /**
   * Inserts a new site row for the domain.
   *
   * @returns The id of the inserted row.
   */
  createSite(domain: string): number {
    const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
    const info = stmt.run(domain);
    // lastInsertRowid is number | bigint; convert instead of asserting the type.
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the site for a domain, creating it first when it does not exist.
   *
   * @throws Error when the row cannot be read back after being inserted.
   */
  firstOrCreateSite(domain: string): Site {
    const existing = this.getSite(domain);
    if (existing) {
      return existing;
    }

    this.createSite(domain);

    const created = this.getSite(domain);
    if (!created) {
      // Fail loudly rather than hand callers `undefined` typed as Site.
      throw new Error(`Failed to load site after creating it: ${domain}`);
    }
    return created;
  }
}
@@ -0,0 +1,74 @@
1
+ import { Database } from 'better-sqlite3';
2
+
3
/** Row shape of the `snapshots` table: one crawl run of a site and its summary statistics. */
export interface Snapshot {
  /** Row identifier. */
  id: number;
  /** Site the snapshot belongs to. */
  site_id: number;
  /** Kind of crawl that produced the snapshot. */
  type: 'full' | 'partial' | 'incremental';
  /** Creation timestamp as stored by the database. */
  created_at: string;

  // Graph size
  node_count: number;
  edge_count: number;

  /** Lifecycle state of the crawl. */
  status: 'running' | 'completed' | 'failed';
  /** Numeric flag. */
  limit_reached: number;

  // Summary statistics
  health_score: number | null;
  orphan_count: number | null;
  thin_content_count: number | null;
}
16
+
17
/** Data access for the `snapshots` table. */
export class SnapshotRepository {
  /** Statistic columns that updateSnapshotStatus is allowed to write. */
  private static readonly STAT_COLUMNS = [
    'node_count',
    'edge_count',
    'limit_reached',
    'health_score',
    'orphan_count',
    'thin_content_count',
  ] as const;

  constructor(private db: Database) {}

  /**
   * Inserts a new snapshot row for a site.
   *
   * @param status Initial status; defaults to 'running'.
   * @returns The id of the inserted row.
   */
  createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status: 'running' | 'completed' | 'failed' = 'running'): number {
    const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
    const info = stmt.run(siteId, type, status);
    // lastInsertRowid is number | bigint; convert instead of asserting the type.
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the most recent snapshot of a site, optionally restricted to one status.
   *
   * @returns The newest matching snapshot, or undefined when none exists.
   */
  getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined {
    let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
    const params: Array<number | string> = [siteId];
    if (status) {
      sql += ' AND status = ?';
      params.push(status);
    }
    // created_at values can collide (e.g. with second-resolution timestamps), so
    // break ties on id to make "latest" deterministic.
    sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
    return this.db.prepare(sql).get(...params) as Snapshot | undefined;
  }

  /**
   * Sets the final status of a snapshot and, optionally, its summary statistics.
   *
   * Only statistics present in `stats` (not `undefined`) are written; a `null`
   * value is written as NULL. Fields of `stats` outside the statistic columns
   * are ignored.
   */
  updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats: Partial<Snapshot> = {}) {
    const sets: string[] = ['status = ?'];
    const params: Array<number | string | null> = [status];

    // Column names come from the fixed whitelist above, never from caller input.
    for (const column of SnapshotRepository.STAT_COLUMNS) {
      const value = stats[column];
      if (value !== undefined) {
        sets.push(`${column} = ?`);
        params.push(value);
      }
    }

    params.push(id);
    const sql = `UPDATE snapshots SET ${sets.join(', ')} WHERE id = ?`;
    this.db.prepare(sql).run(...params);
  }

  /** Returns the snapshot with the given id, or undefined when it does not exist. */
  getSnapshot(id: number): Snapshot | undefined {
    return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id) as Snapshot | undefined;
  }
}