@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,228 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/**
 * Row shape of the `pages` table as returned by `SELECT *`.
 *
 * `noindex`, `nofollow` and `crawl_trap_flag` are numeric columns (the
 * repository defaults them to 0 on insert); every nullable column is typed
 * `| null`.
 */
export interface Page {
  id: number;
  site_id: number;
  normalized_url: string;
  first_seen_snapshot_id: number | null;
  last_seen_snapshot_id: number | null;
  http_status: number | null;
  canonical_url: string | null;
  content_hash: string | null;
  simhash: string | null;
  etag: string | null;
  last_modified: string | null;
  html: string | null;
  soft404_score: number | null;
  noindex: number;
  nofollow: number;
  security_error: string | null;
  retries: number;
  depth: number;
  redirect_chain: string | null;
  bytes_received: number | null;
  crawl_trap_flag: number;
  crawl_trap_risk: number | null;
  trap_type: string | null;
  created_at: string;
  updated_at: string;
}

/** Minimum data required to upsert a page; every other column is optional. */
type PageUpsertInput = Partial<Page> & {
  site_id: number;
  normalized_url: string;
  last_seen_snapshot_id: number;
};

/**
 * Single source of truth for the page upsert.
 *
 * On conflict with an existing (site_id, normalized_url) row:
 * - `first_seen_snapshot_id` is left untouched (it is not in the SET list);
 * - `last_seen_snapshot_id` is always overwritten;
 * - `http_status`, `noindex` and `nofollow` are replaced only when the incoming
 *   `http_status` is non-zero (a NULL incoming status also keeps the old values,
 *   because `NULL != 0` is not true in SQL);
 * - COALESCE columns keep the stored value when the incoming one is NULL;
 * - `retries` / `crawl_trap_flag` keep the maximum, `depth` keeps the minimum.
 */
const UPSERT_PAGE_SQL = `
  INSERT INTO pages (
    site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
    http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
    soft404_score, noindex, nofollow, security_error, retries, depth,
    redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
    updated_at
  ) VALUES (
    @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
    @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
    @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
    @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
    datetime('now')
  )
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
    last_seen_snapshot_id = excluded.last_seen_snapshot_id,
    http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
    canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
    content_hash = COALESCE(excluded.content_hash, pages.content_hash),
    simhash = COALESCE(excluded.simhash, pages.simhash),
    etag = COALESCE(excluded.etag, pages.etag),
    last_modified = COALESCE(excluded.last_modified, pages.last_modified),
    html = COALESCE(excluded.html, pages.html),
    soft404_score = COALESCE(excluded.soft404_score, pages.soft404_score),
    noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
    nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
    security_error = COALESCE(excluded.security_error, pages.security_error),
    retries = MAX(pages.retries, excluded.retries),
    depth = MIN(pages.depth, excluded.depth),
    redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
    bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
    crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
    crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
    trap_type = COALESCE(excluded.trap_type, pages.trap_type),
    updated_at = datetime('now')
`;

/** Shared FROM/WHERE clause of the three "pages of a snapshot" queries. */
const PAGES_OF_SNAPSHOT_CLAUSE =
  'FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.first_seen_snapshot_id <= ?';

/**
 * Maps a partial page onto the full set of named bind parameters expected by
 * {@link UPSERT_PAGE_SQL}, filling omitted columns with their defaults.
 *
 * NOTE(review): an omitted `depth` is bound as 0, and the conflict clause keeps
 * MIN(stored, incoming) — so an update that leaves `depth` out resets the stored
 * depth to 0. Confirm callers always pass `depth` when it matters.
 */
function toUpsertParams(page: PageUpsertInput) {
  return {
    site_id: page.site_id,
    normalized_url: page.normalized_url,
    first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
    last_seen_snapshot_id: page.last_seen_snapshot_id,
    http_status: page.http_status ?? null,
    canonical_url: page.canonical_url ?? null,
    content_hash: page.content_hash ?? null,
    simhash: page.simhash ?? null,
    etag: page.etag ?? null,
    last_modified: page.last_modified ?? null,
    html: page.html ?? null,
    soft404_score: page.soft404_score ?? null,
    noindex: page.noindex ?? 0,
    nofollow: page.nofollow ?? 0,
    security_error: page.security_error ?? null,
    retries: page.retries ?? 0,
    depth: page.depth ?? 0,
    redirect_chain: page.redirect_chain ?? null,
    bytes_received: page.bytes_received ?? null,
    crawl_trap_flag: page.crawl_trap_flag ?? 0,
    crawl_trap_risk: page.crawl_trap_risk ?? null,
    trap_type: page.trap_type ?? null,
  };
}

/** Wraps a factory so it runs at most once; later calls return the cached value. */
function lazily<T>(factory: () => T): () => T {
  let cell: { value: T } | undefined;
  return () => {
    if (cell === undefined) cell = { value: factory() };
    return cell.value;
  };
}

/** Data-access object for the `pages` table. */
export class PageRepository {
  private upsertStmt;
  private getIdStmt;
  private getPageStmt;
  private upsertReturningStmt;

  /**
   * Prepares the hot-path statements once. The `RETURNING id` variant is
   * prepared on first use of {@link upsertMany} (as before, nothing about
   * RETURNING is required until that method is called) and then reused.
   */
  constructor(private db: Database) {
    this.upsertStmt = this.db.prepare(UPSERT_PAGE_SQL);
    this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
    this.getPageStmt = this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?');
    this.upsertReturningStmt = lazily(() => this.db.prepare(`${UPSERT_PAGE_SQL}\nRETURNING id`));
  }

  /**
   * Inserts the page or merges it into the existing (site_id, normalized_url) row.
   * @returns the driver's run-info object for the statement.
   */
  upsertPage(page: PageUpsertInput) {
    return this.upsertStmt.run(toUpsertParams(page));
  }

  /**
   * Upserts the page and reads its id back inside one transaction.
   * @throws Error if the row cannot be found after the upsert.
   */
  upsertAndGetId(page: PageUpsertInput): number {
    const tx = this.db.transaction(() => {
      this.upsertPage(page);
      const row = this.getIdStmt.get(page.site_id, page.normalized_url) as { id: number } | undefined;
      if (!row) throw new Error(`Failed to retrieve ID for upserted page: ${page.normalized_url}`);
      return row.id;
    });
    return tx();
  }

  /** Returns the full row for a URL of a site, or `undefined` when absent. */
  getPage(siteId: number, url: string): Page | undefined {
    return this.getPageStmt.get(siteId, url) as Page | undefined;
  }

  /**
   * Fetches the rows for many URLs of one site.
   * URLs are queried in chunks of 900 so each `IN (...)` list stays bounded.
   */
  getPagesByUrls(siteId: number, urls: string[]): Page[] {
    if (urls.length === 0) return [];
    const chunkSize = 900;
    const results: Page[] = [];

    for (let i = 0; i < urls.length; i += chunkSize) {
      const chunk = urls.slice(i, i + chunkSize);
      const placeholders = chunk.map(() => '?').join(',');
      const chunkResults = this.db
        .prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`)
        .all(siteId, ...chunk) as Page[];
      results.push(...chunkResults);
    }

    return results;
  }

  /**
   * Upserts all pages in a single transaction.
   * @returns a map from `normalized_url` to row id. The key is the URL alone,
   *   so a batch mixing several sites with the same URL keeps only the last id.
   */
  upsertMany(pages: PageUpsertInput[]): Map<string, number> {
    const urlToId = new Map<string, number>();
    if (pages.length === 0) return urlToId;

    const stmt = this.upsertReturningStmt();
    const tx = this.db.transaction((batch: PageUpsertInput[]) => {
      for (const page of batch) {
        const row = stmt.get(toUpsertParams(page)) as { id: number };
        urlToId.set(page.normalized_url, row.id);
      }
    });

    tx(pages);
    return urlToId;
  }

  /** All pages of the snapshot's site whose `first_seen_snapshot_id` is <= the snapshot id. */
  getPagesBySnapshot(snapshotId: number): Page[] {
    return this.db.prepare(`SELECT p.* ${PAGES_OF_SNAPSHOT_CLAUSE}`).all(snapshotId, snapshotId) as Page[];
  }

  /** Same selection as {@link getPagesBySnapshot}, restricted to `id` and `normalized_url`. */
  getPagesIdentityBySnapshot(snapshotId: number): { id: number; normalized_url: string }[] {
    return this.db
      .prepare(`SELECT p.id, p.normalized_url ${PAGES_OF_SNAPSHOT_CLAUSE}`)
      .all(snapshotId, snapshotId) as { id: number; normalized_url: string }[];
  }

  /**
   * Streaming variant of {@link getPagesBySnapshot}.
   * A fresh statement is prepared per call on purpose: each returned iterator
   * then owns its own statement, so two iterations can be open at once.
   */
  getPagesIteratorBySnapshot(snapshotId: number): IterableIterator<Page> {
    return this.db
      .prepare(`SELECT p.* ${PAGES_OF_SNAPSHOT_CLAUSE}`)
      .iterate(snapshotId, snapshotId) as IterableIterator<Page>;
  }

  /** Returns the row id for a URL of a site, or `undefined` when absent. */
  getIdByUrl(siteId: number, url: string): number | undefined {
    const row = this.getIdStmt.get(siteId, url) as { id: number } | undefined;
    return row?.id;
  }
}
@@ -1,43 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/** Row shape of the `sites` table. */
export interface Site {
  id: number;
  domain: string;
  created_at: string;
  settings_json: string | null;
  is_active: number;
}

/** Data-access object for the `sites` table. */
export class SiteRepository {
  constructor(private db: Database) {}

  /** Looks a site up by primary key. */
  getSiteById(id: number): Site | undefined {
    return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id) as Site | undefined;
  }

  /** Looks a site up by its domain. */
  getSite(domain: string): Site | undefined {
    return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain) as Site | undefined;
  }

  /** Returns every site ordered by domain, ascending. */
  getAllSites(): Site[] {
    return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all() as Site[];
  }

  /**
   * Inserts a new site row.
   * @returns the new row id, converted to a plain number (the driver may
   *   report `lastInsertRowid` as a bigint).
   * @throws if the insert fails, e.g. when the domain already exists.
   */
  createSite(domain: string): number {
    const info = this.db.prepare('INSERT INTO sites (domain) VALUES (?)').run(domain);
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the site for `domain`, creating it when it does not exist yet.
   *
   * The insert uses `ON CONFLICT(domain) DO NOTHING`, so a row inserted by
   * someone else between the lookup and the insert is reused instead of
   * raising a UNIQUE-constraint error.
   *
   * @throws Error if the row still cannot be read after the insert.
   */
  firstOrCreateSite(domain: string): Site {
    const existing = this.getSite(domain);
    if (existing) return existing;

    this.db.prepare('INSERT INTO sites (domain) VALUES (?) ON CONFLICT(domain) DO NOTHING').run(domain);

    const created = this.getSite(domain);
    if (!created) throw new Error(`Failed to load site after insert: ${domain}`);
    return created;
  }

  /** Deletes the site row with the given id. */
  deleteSite(id: number): void {
    this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
  }
}
@@ -1,99 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/** Row shape of the `snapshots` table. */
export interface Snapshot {
  id: number;
  site_id: number;
  type: 'full' | 'partial' | 'incremental';
  created_at: string;
  node_count: number;
  edge_count: number;
  status: 'running' | 'completed' | 'failed';
  limit_reached: number;
  health_score: number | null;
  orphan_count: number | null;
  thin_content_count: number | null;
}

/**
 * Statistic columns that `updateSnapshotStatus` may write, in the order they
 * are appended to the generated SET clause.
 */
const SNAPSHOT_STAT_COLUMNS = [
  'node_count',
  'edge_count',
  'limit_reached',
  'health_score',
  'orphan_count',
  'thin_content_count',
] as const;

/** Data-access object for the `snapshots` table. */
export class SnapshotRepository {
  constructor(private db: Database) {}

  /**
   * Inserts a snapshot row for the site.
   * @param status initial status; defaults to `'running'`.
   * @returns the new row id, converted to a plain number (the driver may
   *   report `lastInsertRowid` as a bigint).
   */
  createSnapshot(
    siteId: number,
    type: 'full' | 'partial' | 'incremental',
    status: 'running' | 'completed' | 'failed' = 'running',
  ): number {
    const info = this.db
      .prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)')
      .run(siteId, type, status);
    return Number(info.lastInsertRowid);
  }

  /**
   * Returns the most recent non-partial snapshot of a site, optionally
   * restricted to one status. Ties on `created_at` are broken by the higher id,
   * so snapshots created at the same timestamp still order deterministically.
   */
  getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined {
    const params: (number | string)[] = [siteId];
    let sql = "SELECT * FROM snapshots WHERE site_id = ? AND type != 'partial'";
    if (status) {
      sql += ' AND status = ?';
      params.push(status);
    }
    sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
    return this.db.prepare(sql).get(...params) as Snapshot | undefined;
  }

  /** Counts all snapshots (of any type or status) recorded for the site. */
  getSnapshotCount(siteId: number): number {
    const result = this.db
      .prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?')
      .get(siteId) as { count: number };
    return result.count;
  }

  /**
   * Sets the snapshot's status and any statistics supplied in `stats`.
   *
   * Only the columns in {@link SNAPSHOT_STAT_COLUMNS} are written, and only
   * when their value is not `undefined` (an explicit `null` is written).
   * Column names come from that fixed list, never from caller input; values are
   * bound as parameters.
   */
  updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats: Partial<Snapshot> = {}) {
    const sets: string[] = ['status = ?'];
    const params: (string | number | null)[] = [status];

    for (const column of SNAPSHOT_STAT_COLUMNS) {
      const value = stats[column];
      if (value !== undefined) {
        sets.push(`${column} = ?`);
        params.push(value);
      }
    }

    params.push(id);
    const sql = `UPDATE snapshots SET ${sets.join(', ')} WHERE id = ?`;
    this.db.prepare(sql).run(...params);
  }

  /** Looks a snapshot up by primary key. */
  getSnapshot(id: number): Snapshot | undefined {
    return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id) as Snapshot | undefined;
  }

  /**
   * Deletes a snapshot and detaches pages from it, all in one transaction.
   *
   * NOTE(review): the cleanup DELETE is not scoped to this snapshot or site —
   * it removes every page whose first/last-seen references are both NULL.
   */
  deleteSnapshot(id: number): void {
    const tx = this.db.transaction(() => {
      // Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies.
      this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
      this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);

      // Cleanup: delete pages that are no longer referenced by any snapshot.
      this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();

      // Delete the snapshot itself.
      this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
    });
    tx();
  }
}
package/src/db/schema.ts DELETED
@@ -1,177 +0,0 @@
1
- import { Database } from 'better-sqlite3';
2
-
3
/**
 * Creates every table and index the crawler needs (each statement is
 * `IF NOT EXISTS`, so re-running is harmless), then runs `migrateSchema` to
 * add columns missing from databases created by an earlier version.
 *
 * Statements execute one at a time, in dependency order: sites, snapshots,
 * pages, edges, metrics, duplicate_clusters, content_clusters.
 */
export function initSchema(db: Database) {
  const ddlStatements: string[] = [
    // Sites table
    `
    CREATE TABLE IF NOT EXISTS sites (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      domain TEXT UNIQUE NOT NULL,
      created_at TEXT DEFAULT (datetime('now')),
      settings_json TEXT,
      is_active INTEGER DEFAULT 1
    );
  `,

    // Snapshots table
    `
    CREATE TABLE IF NOT EXISTS snapshots (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      site_id INTEGER NOT NULL,
      type TEXT CHECK(type IN ('full', 'partial', 'incremental')) NOT NULL,
      created_at TEXT DEFAULT (datetime('now')),
      node_count INTEGER DEFAULT 0,
      edge_count INTEGER DEFAULT 0,
      status TEXT CHECK(status IN ('running', 'completed', 'failed')) DEFAULT 'running',
      limit_reached INTEGER DEFAULT 0,
      health_score REAL,
      orphan_count INTEGER,
      thin_content_count INTEGER,
      FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE
    );
  `,

    // Pages table
    `
    CREATE TABLE IF NOT EXISTS pages (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      site_id INTEGER NOT NULL,
      normalized_url TEXT NOT NULL,
      first_seen_snapshot_id INTEGER,
      last_seen_snapshot_id INTEGER,
      http_status INTEGER,
      canonical_url TEXT,
      content_hash TEXT,
      simhash TEXT,
      etag TEXT,
      last_modified TEXT,
      html TEXT,
      soft404_score REAL,
      noindex INTEGER DEFAULT 0,
      nofollow INTEGER DEFAULT 0,
      security_error TEXT,
      retries INTEGER DEFAULT 0,
      depth INTEGER DEFAULT 0,
      redirect_chain TEXT,
      bytes_received INTEGER,
      crawl_trap_flag INTEGER DEFAULT 0,
      crawl_trap_risk REAL,
      trap_type TEXT,
      created_at TEXT DEFAULT (datetime('now')),
      updated_at TEXT DEFAULT (datetime('now')),
      FOREIGN KEY(site_id) REFERENCES sites(id) ON DELETE CASCADE,
      FOREIGN KEY(first_seen_snapshot_id) REFERENCES snapshots(id),
      FOREIGN KEY(last_seen_snapshot_id) REFERENCES snapshots(id),
      UNIQUE(site_id, normalized_url)
    );
  `,

    // Index for pages
    `CREATE INDEX IF NOT EXISTS idx_pages_site_last_seen ON pages(site_id, last_seen_snapshot_id);`,

    // Edges table
    `
    CREATE TABLE IF NOT EXISTS edges (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      snapshot_id INTEGER NOT NULL,
      source_page_id INTEGER NOT NULL,
      target_page_id INTEGER NOT NULL,
      weight REAL DEFAULT 1.0,
      rel TEXT CHECK(rel IN ('nofollow', 'sponsored', 'ugc', 'internal', 'external', 'unknown')) DEFAULT 'internal',
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
      FOREIGN KEY(source_page_id) REFERENCES pages(id) ON DELETE CASCADE,
      FOREIGN KEY(target_page_id) REFERENCES pages(id) ON DELETE CASCADE
    );
  `,

    // Indexes for edges
    `CREATE INDEX IF NOT EXISTS idx_edges_snapshot_source ON edges(snapshot_id, source_page_id);`,
    `CREATE INDEX IF NOT EXISTS idx_edges_snapshot ON edges(snapshot_id);`,

    // Metrics table
    `
    CREATE TABLE IF NOT EXISTS metrics (
      snapshot_id INTEGER NOT NULL,
      page_id INTEGER NOT NULL,
      authority_score REAL,
      hub_score REAL,
      pagerank REAL,
      pagerank_score REAL,
      link_role TEXT CHECK(link_role IN ('hub', 'authority', 'power', 'balanced', 'peripheral')),
      crawl_status TEXT,
      word_count INTEGER,
      thin_content_score REAL,
      external_link_ratio REAL,
      orphan_score INTEGER,
      duplicate_cluster_id TEXT,
      duplicate_type TEXT CHECK(duplicate_type IN ('exact', 'near', 'template_heavy', 'none')),
      is_cluster_primary INTEGER DEFAULT 0,
      PRIMARY KEY(snapshot_id, page_id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
      FOREIGN KEY(page_id) REFERENCES pages(id) ON DELETE CASCADE
    );
  `,

    // Index for metrics
    `CREATE INDEX IF NOT EXISTS idx_metrics_snapshot ON metrics(snapshot_id);`,

    // Duplicate clusters table
    `
    CREATE TABLE IF NOT EXISTS duplicate_clusters (
      id TEXT NOT NULL,
      snapshot_id INTEGER NOT NULL,
      type TEXT CHECK(type IN ('exact', 'near', 'template_heavy')) NOT NULL,
      size INTEGER NOT NULL,
      representative TEXT NOT NULL,
      severity TEXT CHECK(severity IN ('low', 'medium', 'high')) NOT NULL,
      PRIMARY KEY(snapshot_id, id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
    );
  `,

    // Content clusters table
    `
    CREATE TABLE IF NOT EXISTS content_clusters (
      id INTEGER NOT NULL,
      snapshot_id INTEGER NOT NULL,
      count INTEGER NOT NULL,
      primary_url TEXT NOT NULL,
      risk TEXT CHECK(risk IN ('low', 'medium', 'high')) NOT NULL,
      shared_path_prefix TEXT,
      PRIMARY KEY(snapshot_id, id),
      FOREIGN KEY(snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
    );
  `,
  ];

  for (const ddl of ddlStatements) {
    db.exec(ddl);
  }

  // Bring databases created before the newer columns existed up to date.
  migrateSchema(db);
}
147
-
148
/**
 * Adds columns introduced after the first release to databases that were
 * created without them.
 *
 * Each column is added with `ALTER TABLE ... ADD COLUMN`. SQLite rejects the
 * statement with a "duplicate column name" error when the column is already
 * there; that one error is expected and ignored. Any other failure is
 * rethrown instead of being silently discarded.
 *
 * @throws whatever `db.exec` throws for reasons other than an existing column.
 */
function migrateSchema(db: Database) {
  // [table, column, column definition] — order matches the original migration sequence.
  const additions = [
    ['pages', 'redirect_chain', 'TEXT'],
    ['pages', 'bytes_received', 'INTEGER'],
    ['pages', 'crawl_trap_flag', 'INTEGER DEFAULT 0'],
    ['pages', 'crawl_trap_risk', 'REAL'],
    ['pages', 'trap_type', 'TEXT'],
    ['edges', 'weight', 'REAL DEFAULT 1.0'],
    ['metrics', 'pagerank_score', 'REAL'],
    ['metrics', 'link_role', 'TEXT'],
    ['metrics', 'duplicate_cluster_id', 'TEXT'],
    ['metrics', 'duplicate_type', 'TEXT'],
    ['metrics', 'is_cluster_primary', 'INTEGER DEFAULT 0'],
  ] as const;

  for (const [table, column, definition] of additions) {
    try {
      db.exec(`ALTER TABLE ${table} ADD COLUMN ${column} ${definition}`);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // Only "column already exists" is an expected outcome of re-running the migration.
      if (!/duplicate column name/i.test(message)) throw err;
    }
  }
}
@@ -1,84 +0,0 @@
1
- import { Graph } from '../graph/graph.js';
2
- import { calculateMetrics } from '../graph/metrics.js';
3
-
4
- export interface DiffResult {
5
- addedUrls: string[];
6
- removedUrls: string[];
7
- changedStatus: { url: string; oldStatus: number; newStatus: number }[];
8
- changedCanonical: { url: string; oldCanonical: string | null; newCanonical: string | null }[];
9
- changedDuplicateGroup: { url: string; oldGroup: string | null; newGroup: string | null }[];
10
- metricDeltas: {
11
- structuralEntropy: number;
12
- orphanCount: number;
13
- crawlEfficiency: number;
14
- };
15
- }
16
-
17
- export function compareGraphs(oldGraph: Graph, newGraph: Graph): DiffResult {
18
- const oldNodes = new Map(oldGraph.getNodes().map(n => [n.url, n]));
19
- const newNodes = new Map(newGraph.getNodes().map(n => [n.url, n]));
20
-
21
- const addedUrls: string[] = [];
22
- const removedUrls: string[] = [];
23
- const changedStatus: { url: string; oldStatus: number; newStatus: number }[] = [];
24
- const changedCanonical: { url: string; oldCanonical: string | null; newCanonical: string | null }[] = [];
25
- const changedDuplicateGroup: { url: string; oldGroup: string | null; newGroup: string | null }[] = [];
26
-
27
- // Added & Changed
28
- for (const [url, newNode] of newNodes) {
29
- const oldNode = oldNodes.get(url);
30
- if (!oldNode) {
31
- addedUrls.push(url);
32
- } else {
33
- // Changed Status
34
- if (oldNode.status !== newNode.status) {
35
- changedStatus.push({ url, oldStatus: oldNode.status, newStatus: newNode.status });
36
- }
37
- // Changed Canonical
38
- if (oldNode.canonical !== newNode.canonical) {
39
- changedCanonical.push({
40
- url,
41
- oldCanonical: oldNode.canonical || null,
42
- newCanonical: newNode.canonical || null
43
- });
44
- }
45
- // Changed Duplicate Group
46
- const oldGroup = oldNode.duplicateClusterId || null;
47
- const newGroup = newNode.duplicateClusterId || null;
48
- if (oldGroup !== newGroup) {
49
- changedDuplicateGroup.push({
50
- url,
51
- oldGroup,
52
- newGroup
53
- });
54
- }
55
- }
56
- }
57
-
58
- // Removed
59
- for (const url of oldNodes.keys()) {
60
- if (!newNodes.has(url)) {
61
- removedUrls.push(url);
62
- }
63
- }
64
-
65
- // Metrics
66
- // maxDepth is ignored by current calculateMetrics implementation but required by signature
67
- const oldMetrics = calculateMetrics(oldGraph, 10);
68
- const newMetrics = calculateMetrics(newGraph, 10);
69
-
70
- const metricDeltas = {
71
- structuralEntropy: newMetrics.structuralEntropy - oldMetrics.structuralEntropy,
72
- orphanCount: newMetrics.orphanPages.length - oldMetrics.orphanPages.length,
73
- crawlEfficiency: newMetrics.crawlEfficiencyScore - oldMetrics.crawlEfficiencyScore
74
- };
75
-
76
- return {
77
- addedUrls,
78
- removedUrls,
79
- changedStatus,
80
- changedCanonical,
81
- changedDuplicateGroup,
82
- metricDeltas
83
- };
84
- }
package/src/events.ts DELETED
@@ -1,16 +0,0 @@
1
- export type CrawlEvent =
2
- | { type: 'crawl:start'; url: string }
3
- | { type: 'crawl:success'; url: string; status: number; durationMs: number; depth?: number }
4
- | { type: 'crawl:error'; url: string; error: string; depth?: number }
5
- | { type: 'crawl:limit-reached'; limit: number }
6
- | { type: 'queue:enqueue'; url: string; depth: number }
7
- | { type: 'metrics:start'; phase: string }
8
- | { type: 'metrics:complete'; durationMs: number }
9
- | { type: 'debug'; message: string; context?: unknown }
10
- | { type: 'info'; message: string; context?: unknown }
11
- | { type: 'warn'; message: string; context?: unknown }
12
- | { type: 'error'; message: string; error?: unknown; context?: unknown };
13
-
14
- export interface EngineContext {
15
- emit: (event: CrawlEvent) => void;
16
- }