@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
package/src/crawler/sitemap.ts DELETED
@@ -1,76 +0,0 @@
1
- import { request } from 'undici';
2
- import * as cheerio from 'cheerio';
3
- import { normalizeUrl } from './normalize.js';
4
- import { EngineContext } from '../events.js';
5
-
6
export class Sitemap {
  constructor(private context?: EngineContext) {}

  /**
   * Downloads a sitemap (or sitemap index) and returns every page URL found.
   *
   * Sitemap indexes are followed recursively. Each sitemap URL is fetched at
   * most once, and traversal stops once more than 50 sitemap URLs have been
   * visited.
   *
   * @param url Location of the sitemap or sitemap index.
   * @returns De-duplicated, normalized page URLs in discovery order.
   */
  async fetch(url: string): Promise<string[]> {
    const seenSitemaps = new Set<string>();
    const collected = new Set<string>();

    await this.processSitemap(url, seenSitemaps, collected);

    return [...collected];
  }

  /**
   * Fetches one sitemap document and either recurses into its children
   * (sitemap index) or adds its `<url><loc>` entries to `urls`.
   * Failures are reported as a `warn` event on the context, never thrown.
   */
  private async processSitemap(url: string, visited: Set<string>, urls: Set<string>) {
    // Loop detection: never fetch the same sitemap twice.
    if (visited.has(url)) return;
    visited.add(url);

    // Hard cap on the number of sitemaps fetched, to prevent abuse.
    if (visited.size > 50) return;

    try {
      const response = await request(url, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 10000,
        bodyTimeout: 10000
      });

      const isSuccess = response.statusCode >= 200 && response.statusCode < 300;
      if (!isSuccess) {
        // Drain the body so the underlying connection can be released.
        await response.body.dump();
        return;
      }

      const xml = await response.body.text();
      // Cheap sanity check: anything that does not start with a tag is not XML.
      if (!xml.trim().startsWith('<')) return;

      const $ = cheerio.load(xml, { xmlMode: true });

      // Trimmed, non-empty text of every element matching `selector`.
      const locTexts = (selector: string): string[] =>
        $(selector)
          .toArray()
          .map((el) => $(el).text().trim())
          .filter((loc) => loc.length > 0);

      // Any `<sitemap><loc>` element marks the document as a sitemap index.
      const isIndex = $('sitemap > loc').length > 0;
      if (isIndex) {
        // Children are processed one at a time to avoid a concurrency spike.
        for (const childUrl of locTexts('sitemap > loc')) {
          await this.processSitemap(childUrl, visited, urls);
        }
        return;
      }

      // Otherwise treat the document as a URL set.
      for (const loc of locTexts('url > loc')) {
        const normalized = normalizeUrl(loc, '');
        if (normalized) {
          urls.add(normalized);
        }
      }
    } catch (e) {
      this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url}`, context: e });
    }
  }
}
package/src/crawler/trap.ts DELETED
@@ -1,96 +0,0 @@
1
-
2
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';

/** Outcome of a trap check: a risk in [0, 1] and the kind of trap, if any. */
export interface TrapResult {
  risk: number;
  type: TrapType | null;
}

/**
 * Heuristic crawl-trap detector.
 *
 * Keeps per-path state (distinct query strings seen, highest page number seen)
 * across calls so that parameter explosions can be recognised over time.
 */
export class TrapDetector {
  /**
   * Date-like path segments such as `/2023/12/01`, `/2023-12-01` or
   * `/01-12-2023`, either followed by `/` or ending the path.
   */
  private static readonly CALENDAR_PATH =
    /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;

  private pathCounters = new Map<string, Set<string>>();
  private paginationCounters = new Map<string, number>();
  private sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);

  // Configurable thresholds
  private PARAM_EXPLOSION_THRESHOLD = 30;
  private PAGINATION_THRESHOLD = 50;

  /**
   * @param options.paramThreshold      Distinct query strings per path above which
   *                                    the path counts as faceted navigation.
   * @param options.paginationThreshold Page number above which pagination is
   *                                    considered a loop.
   * Falsy values (including 0) keep the defaults.
   */
  constructor(options: { paramThreshold?: number, paginationThreshold?: number } = {}) {
    if (options.paramThreshold) this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold;
    if (options.paginationThreshold) this.PAGINATION_THRESHOLD = options.paginationThreshold;
  }

  /**
   * Checks if a URL represents a potential crawl trap.
   *
   * The returned `type` belongs to the highest-risk session/calendar/pagination
   * detection; `faceted_navigation` is only reported when none of those fired,
   * although a parameter explosion always raises `risk` to at least 0.95.
   *
   * @param rawUrl Absolute URL to inspect. Unparseable URLs yield `{ risk: 0, type: null }`.
   * @param _depth Crawl depth of the URL (currently unused).
   */
  checkTrap(rawUrl: string, _depth: number): TrapResult {
    const result: TrapResult = { risk: 0, type: null };

    // Records a detection, keeping `type` in sync with the highest risk seen.
    const flag = (risk: number, type: TrapType): void => {
      if (risk > result.risk) {
        result.risk = risk;
        result.type = type;
      }
    };

    try {
      const u = new URL(rawUrl);
      const params = new URLSearchParams(u.search);
      const pathname = u.pathname;
      const pathKey = `${u.origin}${pathname}`;

      // 1. Session IDs / Tracking Parameters
      // Parameters are counted here instead of reading `params.size`, which is
      // missing on older runtimes and would silently disable step 4.
      let paramCount = 0;
      for (const key of params.keys()) {
        paramCount++;
        const lowered = key.toLowerCase();
        if (this.sessionParams.has(lowered) || lowered.includes('session')) {
          flag(0.9, 'session_trap');
        }
      }

      // 2. Calendar Pattern Detection
      if (TrapDetector.CALENDAR_PATH.test(pathname)) {
        flag(0.7, 'calendar_trap');
      }

      // 3. Pagination Loop
      const pageParam = params.get('page') || params.get('p') || params.get('pg');
      if (pageParam && /^\d+$/.test(pageParam)) {
        const pageNum = parseInt(pageParam, 10);
        const currentMaxPage = this.paginationCounters.get(pathKey) || 0;

        if (pageNum > currentMaxPage) {
          this.paginationCounters.set(pathKey, pageNum);
        }

        if (pageNum > this.PAGINATION_THRESHOLD) {
          flag(0.85, 'pagination_loop');
        }
      }

      // 4. Infinite Parameter Explosion (Faceted Navigation)
      if (paramCount > 0) {
        const paramSet = this.pathCounters.get(pathKey) || new Set<string>();
        // Sorting makes `?a=1&b=2` and `?b=2&a=1` count as one combination.
        params.sort();
        paramSet.add(params.toString());
        this.pathCounters.set(pathKey, paramSet);

        if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
          result.risk = Math.max(result.risk, 0.95);
          // More specific trap types take precedence over the generic label.
          if (!result.type) result.type = 'faceted_navigation';
        }
      }
    } catch (_e) {
      // Invalid URL: nothing can be inferred, report whatever was found so far.
    }

    return result;
  }

  /**
   * Resets internal state (useful for multi-crawl sessions if needed)
   */
  reset() {
    this.pathCounters.clear();
    this.paginationCounters.clear();
  }
}
package/src/db/graphLoader.ts DELETED
@@ -1,135 +0,0 @@
1
- import { getDb } from './index.js';
2
- import { PageRepository } from './repositories/PageRepository.js';
3
- import { EdgeRepository } from './repositories/EdgeRepository.js';
4
- import { MetricsRepository, DbMetrics } from './repositories/MetricsRepository.js';
5
- import { SnapshotRepository } from './repositories/SnapshotRepository.js';
6
- import { Graph } from '../graph/graph.js';
7
-
8
/**
 * Decodes the JSON-encoded redirect chain stored on a page row.
 * Returns `undefined` for an empty value or for text that is not valid JSON,
 * so that one corrupt row cannot abort loading of the whole snapshot.
 */
function parseRedirectChain(raw: string | null | undefined) {
  if (!raw) return undefined;
  try {
    return JSON.parse(raw);
  } catch (_e) {
    return undefined;
  }
}

/**
 * Rebuilds an in-memory {@link Graph} from the rows persisted for a snapshot.
 *
 * Loads pages (as nodes, with their per-page metrics merged in), edges,
 * duplicate clusters and content clusters, and derives the session statistics
 * from the stored crawl statuses.
 *
 * @param snapshotId Identifier of the snapshot to load.
 * @returns The populated graph. Edges whose source or target page is not part
 *          of the snapshot's page set are skipped.
 */
export function loadGraphFromSnapshot(snapshotId: number): Graph {
  const db = getDb();
  const pageRepo = new PageRepository(db);
  const edgeRepo = new EdgeRepository(db);
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);

  const pages = pageRepo.getPagesIteratorBySnapshot(snapshotId);
  const metrics = metricsRepo.getMetricsIterator(snapshotId);
  const snapshot = snapshotRepo.getSnapshot(snapshotId);

  // Index metrics by page id so each page can be joined with its metrics row.
  const metricsMap = new Map<number, DbMetrics>();
  for (const m of metrics) {
    metricsMap.set(m.page_id, m);
  }

  const graph = new Graph();
  let pagesFetched = 0;
  let pagesCached = 0;
  let pagesSkipped = 0;

  if (snapshot) {
    graph.limitReached = !!snapshot.limit_reached;
  }

  // Maps database page ids to the normalized URL used as the node key.
  const idMap = new Map<number, string>();

  for (const p of pages) {
    idMap.set(p.id, p.normalized_url);
    graph.addNode(p.normalized_url, p.depth, p.http_status || 0);

    const m = metricsMap.get(p.id);
    if (m) {
      // Every status that represents an attempted request counts as "fetched".
      const isProcessed = m.crawl_status === 'fetched' ||
        m.crawl_status === 'fetched_error' ||
        m.crawl_status === 'network_error' ||
        m.crawl_status === 'failed_after_retries' ||
        m.crawl_status === 'blocked_by_robots';

      if (isProcessed) pagesFetched++;
      else if (m.crawl_status === 'cached') pagesCached++;
      else if (m.crawl_status === 'skipped') pagesSkipped++;
    }

    // First seen in this snapshot => new; otherwise derived from crawl status.
    let incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined;
    if (p.first_seen_snapshot_id === snapshotId) {
      incrementalStatus = 'new';
    } else if (m?.crawl_status === 'cached') {
      incrementalStatus = 'unchanged';
    } else if (m?.crawl_status === 'fetched') {
      incrementalStatus = 'changed';
    }

    graph.updateNodeData(p.normalized_url, {
      canonical: p.canonical_url || undefined,
      contentHash: p.content_hash || undefined,
      simhash: p.simhash || undefined,
      etag: p.etag || undefined,
      lastModified: p.last_modified || undefined,
      html: p.html || undefined,
      soft404Score: p.soft404_score || undefined,
      noindex: !!p.noindex,
      nofollow: !!p.nofollow,
      incrementalStatus,
      securityError: p.security_error || undefined,
      retries: p.retries || undefined,
      bytesReceived: p.bytes_received || undefined,
      redirectChain: parseRedirectChain(p.redirect_chain),
      crawlTrapFlag: !!p.crawl_trap_flag,
      crawlTrapRisk: p.crawl_trap_risk || undefined,
      trapType: p.trap_type || undefined,
      // Metrics
      pageRank: m?.pagerank ?? undefined,
      pageRankScore: m?.pagerank_score ?? m?.pagerank ?? undefined,
      authorityScore: m?.authority_score ?? undefined,
      hubScore: m?.hub_score ?? undefined,
      linkRole: m?.link_role ?? undefined,
      // Duplicate info
      duplicateClusterId: m?.duplicate_cluster_id ?? undefined,
      duplicateType: m?.duplicate_type ?? undefined,
      isClusterPrimary: m?.is_cluster_primary ? true : undefined,
      // Additional metrics
      crawlStatus: m?.crawl_status || undefined,
      wordCount: m?.word_count != null ? m.word_count : undefined,
      thinContentScore: m?.thin_content_score != null ? m.thin_content_score : undefined,
      externalLinkRatio: m?.external_link_ratio != null ? m.external_link_ratio : undefined,
      orphanScore: m?.orphan_score != null ? m.orphan_score : undefined,
    });
  }

  const edges = edgeRepo.getEdgesIteratorBySnapshot(snapshotId);

  for (const e of edges) {
    const source = idMap.get(e.source_page_id);
    const target = idMap.get(e.target_page_id);
    if (source && target) {
      // NOTE(review): a stored weight of 0 is loaded as 1.0 here - confirm intended.
      graph.addEdge(source, target, e.weight || 1.0);
    }
  }

  // Load duplicate clusters
  const dupClusters = db.prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.duplicateClusters = dupClusters.map(c => ({
    id: c.id,
    type: c.type,
    size: c.size,
    representative: c.representative,
    severity: c.severity
  }));

  // Load content clusters
  const contentClusters = db.prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?').all(snapshotId) as any[];
  graph.contentClusters = contentClusters.map(c => ({
    id: c.id,
    count: c.count,
    primaryUrl: c.primary_url,
    risk: c.risk,
    sharedPathPrefix: c.shared_path_prefix || undefined
  }));

  // Set session stats
  graph.sessionStats = {
    pagesFetched,
    pagesCached,
    pagesSkipped,
    totalFound: idMap.size
  };

  return graph;
}
package/src/db/index.ts DELETED
@@ -1,75 +0,0 @@
1
- import Database from 'better-sqlite3';
2
- import path from 'node:path';
3
- import fs from 'node:fs';
4
- import os from 'node:os';
5
- import { initSchema } from './schema.js';
6
-
7
- let dbInstance: Database.Database | null = null;
8
-
9
- export * from './repositories/SiteRepository.js';
10
- export * from './repositories/SnapshotRepository.js';
11
- export { initSchema } from './schema.js';
12
-
13
/**
 * Resolves the location of the SQLite database.
 *
 * Resolution order:
 *  1. `:memory:` when `NODE_ENV` is `test`;
 *  2. the `CRAWLITH_DB_PATH` environment variable, when set and non-empty;
 *  3. `<home>/.crawlith/crawlith.db`, creating the directory if it is missing.
 *
 * @returns A path (or `:memory:`) suitable for opening the database.
 */
export function getDbPath(): string {
  if (process.env.NODE_ENV === 'test') {
    return ':memory:';
  }
  if (process.env.CRAWLITH_DB_PATH) {
    return process.env.CRAWLITH_DB_PATH;
  }
  const crawlithDir = path.join(os.homedir(), '.crawlith');
  if (!fs.existsSync(crawlithDir)) {
    // Create the directory as user-only (700) from the start, so it is never
    // briefly exposed with default permissions.
    fs.mkdirSync(crawlithDir, { recursive: true, mode: 0o700 });
    // The creation mode is filtered by the process umask; enforce it explicitly.
    fs.chmodSync(crawlithDir, 0o700);
  }
  return path.join(crawlithDir, 'crawlith.db');
}
29
-
30
/**
 * Returns the process-wide database connection, opening and configuring it on
 * first use.
 *
 * On first call the database is opened at {@link getDbPath}, tuned via
 * pragmas, integrity-checked (a failure is only logged) and its schema is
 * initialised. The connection is cached only after all of that succeeded.
 *
 * @throws Whatever the driver or `initSchema` throws; in that case the
 *         half-initialised connection is closed before the error propagates.
 */
export function getDb(): Database.Database {
  if (dbInstance) {
    return dbInstance;
  }

  const dbPath = getDbPath();
  const db = new Database(dbPath);

  try {
    // Hardening & Performance Configuration
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');
    db.pragma('foreign_keys = ON');
    db.pragma('temp_store = MEMORY');
    db.pragma('mmap_size = 30000000000');
    db.pragma('cache_size = -20000');
    db.pragma('busy_timeout = 5000');

    // Security controls
    // Ensure file permissions are 600 (user read/write only). In-memory
    // databases have no backing file, so there is nothing to restrict.
    if (dbPath !== ':memory:') {
      try {
        fs.chmodSync(dbPath, 0o600);
      } catch (_e) {
        // Best effort: the file system may not support chmod.
      }
    }

    // Integrity check on startup
    const integrity = db.pragma('integrity_check', { simple: true });
    if (integrity !== 'ok') {
      // Reverted to console.warn to avoid breaking change
      console.warn('Database integrity check failed:', integrity);
    }

    // Initialize schema
    initSchema(db);
  } catch (err) {
    // Do not leak the connection: it was never cached, so nothing else would
    // ever close it, and every retry would open another one.
    db.close();
    throw err;
  }

  dbInstance = db;
  return db;
}
69
-
70
/**
 * Closes the cached database connection, if one is open, and forgets it so
 * that the next `getDb()` call opens a fresh connection.
 */
export function closeDb() {
  if (!dbInstance) return;

  dbInstance.close();
  dbInstance = null;
}
package/src/db/repositories/EdgeRepository.ts DELETED
@@ -1,43 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/** A row of the `edges` table: one link between two pages of a snapshot. */
export interface Edge {
  id: number;
  snapshot_id: number;
  source_page_id: number;
  target_page_id: number;
  weight: number;
  rel: 'nofollow' | 'sponsored' | 'ugc' | 'internal' | 'external' | 'unknown';
}

/** Data access for the `edges` table. */
export class EdgeRepository {
  /** Query shared by the eager and the streaming snapshot readers. */
  private static readonly SELECT_BY_SNAPSHOT = 'SELECT * FROM edges WHERE snapshot_id = ?';

  private insertStmt;

  constructor(private db: Database) {
    // Prepared once and reused by every insert.
    this.insertStmt = this.db.prepare(`
      INSERT INTO edges (snapshot_id, source_page_id, target_page_id, weight, rel)
      VALUES (?, ?, ?, ?, ?)
    `);
  }

  /** Inserts a single edge; `weight` defaults to 1.0 and `rel` to `'internal'`. */
  insertEdge(snapshotId: number, sourcePageId: number, targetPageId: number, weight: number = 1.0, rel: string = 'internal') {
    this.insertStmt.run(snapshotId, sourcePageId, targetPageId, weight, rel);
  }

  /** Inserts all given edges inside one transaction. An empty list is a no-op. */
  insertEdges(edges: { snapshot_id: number; source_page_id: number; target_page_id: number; weight: number; rel: string }[]) {
    if (edges.length === 0) return;

    const writeBatch = this.db.transaction((batch: typeof edges) => {
      batch.forEach(({ snapshot_id, source_page_id, target_page_id, weight, rel }) => {
        this.insertStmt.run(snapshot_id, source_page_id, target_page_id, weight, rel);
      });
    });
    writeBatch(edges);
  }

  /** Returns every edge of the snapshot as an array. */
  getEdgesBySnapshot(snapshotId: number): Edge[] {
    const query = this.db.prepare(EdgeRepository.SELECT_BY_SNAPSHOT);
    return query.all(snapshotId) as Edge[];
  }

  /** Streams the edges of the snapshot; a fresh statement is prepared per call. */
  getEdgesIteratorBySnapshot(snapshotId: number): IterableIterator<Edge> {
    const query = this.db.prepare(EdgeRepository.SELECT_BY_SNAPSHOT);
    return query.iterate(snapshotId) as IterableIterator<Edge>;
  }
}
package/src/db/repositories/MetricsRepository.ts DELETED
@@ -1,63 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/** A row of the `metrics` table: computed metrics of one page in one snapshot. */
export interface DbMetrics {
  snapshot_id: number;
  page_id: number;
  authority_score: number | null;
  hub_score: number | null;
  pagerank: number | null;
  pagerank_score: number | null;
  link_role: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral' | null;
  crawl_status: string | null;
  word_count: number | null;
  thin_content_score: number | null;
  external_link_ratio: number | null;
  orphan_score: number | null;
  duplicate_cluster_id: string | null;
  duplicate_type: 'exact' | 'near' | 'template_heavy' | 'none' | null;
  is_cluster_primary: number;
}

/** Data access for the `metrics` table. */
export class MetricsRepository {
  private insertStmt;
  private getByPageStmt;

  constructor(private db: Database) {
    this.getByPageStmt = this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ? AND page_id = ?');
    // INSERT OR REPLACE: writing metrics for a row that already exists
    // overwrites it instead of failing.
    this.insertStmt = this.db.prepare(`
      INSERT OR REPLACE INTO metrics (
        snapshot_id, page_id, authority_score, hub_score, pagerank, pagerank_score,
        link_role, crawl_status, word_count, thin_content_score, external_link_ratio,
        orphan_score, duplicate_cluster_id, duplicate_type, is_cluster_primary
      ) VALUES (
        @snapshot_id, @page_id, @authority_score, @hub_score, @pagerank, @pagerank_score,
        @link_role, @crawl_status, @word_count, @thin_content_score, @external_link_ratio,
        @orphan_score, @duplicate_cluster_id, @duplicate_type, @is_cluster_primary
      )
    `);
  }

  /** Inserts (or replaces) a single metrics row. */
  insertMetrics(metrics: DbMetrics) {
    this.insertStmt.run(metrics);
  }

  /** Returns all metrics rows of the snapshot as an array. */
  getMetrics(snapshotId: number): DbMetrics[] {
    return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').all(snapshotId) as DbMetrics[];
  }

  /** Streams the metrics rows of the snapshot; a fresh statement is prepared per call. */
  getMetricsIterator(snapshotId: number): IterableIterator<DbMetrics> {
    return this.db.prepare('SELECT * FROM metrics WHERE snapshot_id = ?').iterate(snapshotId) as IterableIterator<DbMetrics>;
  }

  /** Returns the metrics row of one page in the snapshot, or `undefined` if there is none. */
  getMetricsForPage(snapshotId: number, pageId: number): DbMetrics | undefined {
    return this.getByPageStmt.get(snapshotId, pageId) as DbMetrics | undefined;
  }

  /**
   * Inserts (or replaces) all given rows inside one transaction.
   * An empty list is a no-op and does not open a transaction.
   */
  insertMany(metricsList: DbMetrics[]) {
    if (metricsList.length === 0) return;

    const insert = this.insertStmt;
    const tx = this.db.transaction((items: DbMetrics[]) => {
      for (const item of items) insert.run(item);
    });
    tx(metricsList);
  }
}