@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -8,36 +8,38 @@ export class PageRepository {
8
8
  INSERT INTO pages (
9
9
  site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
10
10
  http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
11
- soft404_score, noindex, nofollow, security_error, retries, depth,
12
- redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
11
+ noindex, nofollow, security_error, retries, depth,
12
+ discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
13
13
  updated_at
14
14
  ) VALUES (
15
15
  @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
16
16
  @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
17
- @soft404_score, @noindex, @nofollow, @security_error, @retries, @depth,
18
- @redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
17
+ @noindex, @nofollow, @security_error, @retries, @depth,
18
+ @discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
19
19
  datetime('now')
20
20
  )
21
21
  ON CONFLICT(site_id, normalized_url) DO UPDATE SET
22
+ first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
22
23
  last_seen_snapshot_id = excluded.last_seen_snapshot_id,
23
- http_status = excluded.http_status,
24
- canonical_url = excluded.canonical_url,
25
- content_hash = excluded.content_hash,
26
- simhash = excluded.simhash,
27
- etag = excluded.etag,
28
- last_modified = excluded.last_modified,
29
- html = excluded.html,
30
- soft404_score = excluded.soft404_score,
31
- noindex = excluded.noindex,
32
- nofollow = excluded.nofollow,
33
- security_error = excluded.security_error,
34
- retries = excluded.retries,
35
- depth = excluded.depth,
36
- redirect_chain = excluded.redirect_chain,
37
- bytes_received = excluded.bytes_received,
38
- crawl_trap_flag = excluded.crawl_trap_flag,
39
- crawl_trap_risk = excluded.crawl_trap_risk,
40
- trap_type = excluded.trap_type,
24
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
25
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
26
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
27
+ simhash = COALESCE(excluded.simhash, pages.simhash),
28
+ etag = COALESCE(excluded.etag, pages.etag),
29
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
30
+ html = COALESCE(excluded.html, pages.html),
31
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
32
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
33
+ security_error = COALESCE(excluded.security_error, pages.security_error),
34
+ retries = MAX(pages.retries, excluded.retries),
35
+ depth = MIN(pages.depth, excluded.depth),
36
+ discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
37
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
38
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
39
+ is_internal = COALESCE(excluded.is_internal, pages.is_internal),
40
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
41
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
42
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
41
43
  updated_at = datetime('now')
42
44
  `);
43
45
  this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
@@ -55,14 +57,15 @@ export class PageRepository {
55
57
  etag: page.etag ?? null,
56
58
  last_modified: page.last_modified ?? null,
57
59
  html: page.html ?? null,
58
- soft404_score: page.soft404_score ?? null,
59
60
  noindex: page.noindex ?? 0,
60
61
  nofollow: page.nofollow ?? 0,
61
62
  security_error: page.security_error ?? null,
62
63
  retries: page.retries ?? 0,
63
64
  depth: page.depth ?? 0,
65
+ discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
64
66
  redirect_chain: page.redirect_chain ?? null,
65
67
  bytes_received: page.bytes_received ?? null,
68
+ is_internal: page.is_internal ?? 1,
66
69
  crawl_trap_flag: page.crawl_trap_flag ?? 0,
67
70
  crawl_trap_risk: page.crawl_trap_risk ?? null,
68
71
  trap_type: page.trap_type ?? null,
@@ -83,11 +86,152 @@ export class PageRepository {
83
86
  getPage(siteId, url) {
84
87
  return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url);
85
88
  }
86
- getPagesBySnapshot(snapshotId) {
87
- return this.db.prepare('SELECT * FROM pages WHERE last_seen_snapshot_id = ?').all(snapshotId);
89
+ getPagesByUrls(siteId, urls) {
90
+ if (urls.length === 0)
91
+ return [];
92
+ const chunkSize = 900;
93
+ const results = [];
94
+ for (let i = 0; i < urls.length; i += chunkSize) {
95
+ const chunk = urls.slice(i, i + chunkSize);
96
+ const placeholders = chunk.map(() => '?').join(',');
97
+ const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk);
98
+ results.push(...chunkResults);
99
+ }
100
+ return results;
101
+ }
102
+ upsertMany(pages) {
103
+ if (pages.length === 0)
104
+ return new Map();
105
+ const upsertStmtWithReturn = this.db.prepare(`
106
+ INSERT INTO pages (
107
+ site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
108
+ http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
109
+ noindex, nofollow, security_error, retries, depth,
110
+ discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
111
+ updated_at
112
+ ) VALUES (
113
+ @site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
114
+ @http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
115
+ @noindex, @nofollow, @security_error, @retries, @depth,
116
+ @discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
117
+ datetime('now')
118
+ )
119
+ ON CONFLICT(site_id, normalized_url) DO UPDATE SET
120
+ first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
121
+ last_seen_snapshot_id = excluded.last_seen_snapshot_id,
122
+ http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
123
+ canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
124
+ content_hash = COALESCE(excluded.content_hash, pages.content_hash),
125
+ simhash = COALESCE(excluded.simhash, pages.simhash),
126
+ etag = COALESCE(excluded.etag, pages.etag),
127
+ last_modified = COALESCE(excluded.last_modified, pages.last_modified),
128
+ html = COALESCE(excluded.html, pages.html),
129
+ noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
130
+ nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
131
+ security_error = COALESCE(excluded.security_error, pages.security_error),
132
+ retries = MAX(pages.retries, excluded.retries),
133
+ depth = MIN(pages.depth, excluded.depth),
134
+ discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
135
+ redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
136
+ bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
137
+ is_internal = COALESCE(excluded.is_internal, pages.is_internal),
138
+ crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
139
+ crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
140
+ trap_type = COALESCE(excluded.trap_type, pages.trap_type),
141
+ updated_at = datetime('now')
142
+ RETURNING id
143
+ `);
144
+ const urlToId = new Map();
145
+ const tx = this.db.transaction((pagesBatch) => {
146
+ for (const page of pagesBatch) {
147
+ const params = {
148
+ site_id: page.site_id,
149
+ normalized_url: page.normalized_url,
150
+ first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
151
+ last_seen_snapshot_id: page.last_seen_snapshot_id,
152
+ http_status: page.http_status ?? null,
153
+ canonical_url: page.canonical_url ?? null,
154
+ content_hash: page.content_hash ?? null,
155
+ simhash: page.simhash ?? null,
156
+ etag: page.etag ?? null,
157
+ last_modified: page.last_modified ?? null,
158
+ html: page.html ?? null,
159
+ noindex: page.noindex ?? 0,
160
+ nofollow: page.nofollow ?? 0,
161
+ security_error: page.security_error ?? null,
162
+ retries: page.retries ?? 0,
163
+ depth: page.depth ?? 0,
164
+ discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
165
+ redirect_chain: page.redirect_chain ?? null,
166
+ bytes_received: page.bytes_received ?? null,
167
+ is_internal: page.is_internal ?? 1,
168
+ crawl_trap_flag: page.crawl_trap_flag ?? 0,
169
+ crawl_trap_risk: page.crawl_trap_risk ?? null,
170
+ trap_type: page.trap_type ?? null,
171
+ };
172
+ const row = upsertStmtWithReturn.get(params);
173
+ urlToId.set(page.normalized_url, row.id);
174
+ }
175
+ });
176
+ tx(pages);
177
+ return urlToId;
178
+ }
179
+ getPagesBySnapshot(snapshotId, runType = 'completed') {
180
+ if (runType === 'single') {
181
+ return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').all(snapshotId);
182
+ }
183
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
184
+ }
185
+ getPagesIdentityBySnapshot(snapshotId) {
186
+ // For identities, always loading all up to this point is fine for the crawler to map URLs to IDs.
187
+ return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
188
+ }
189
+ getPagesIteratorBySnapshot(snapshotId, runType = 'completed') {
190
+ if (runType === 'single') {
191
+ return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').iterate(snapshotId);
192
+ }
193
+ return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').iterate(snapshotId, snapshotId);
88
194
  }
89
195
  getIdByUrl(siteId, url) {
90
196
  const row = this.getIdStmt.get(siteId, url);
91
197
  return row?.id;
92
198
  }
199
+ reconcileInternalUrls(siteId, siteOrigin) {
200
+ const origin = siteOrigin.replace(/\/+$/, '');
201
+ const tx = this.db.transaction(() => {
202
+ const rows = this.db
203
+ .prepare("SELECT id, normalized_url FROM pages WHERE site_id = ? AND (normalized_url LIKE 'http://%' OR normalized_url LIKE 'https://%')")
204
+ .all(siteId);
205
+ for (const row of rows) {
206
+ let parsed;
207
+ try {
208
+ parsed = new URL(row.normalized_url);
209
+ }
210
+ catch {
211
+ continue;
212
+ }
213
+ if (parsed.origin !== origin) {
214
+ continue;
215
+ }
216
+ const targetPath = `${parsed.pathname}${parsed.search}`;
217
+ if (targetPath === row.normalized_url) {
218
+ continue;
219
+ }
220
+ const existing = this.db
221
+ .prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?')
222
+ .get(siteId, targetPath);
223
+ if (existing && existing.id !== row.id) {
224
+ this.db.prepare('UPDATE edges SET source_page_id = ? WHERE source_page_id = ?').run(existing.id, row.id);
225
+ this.db.prepare('UPDATE edges SET target_page_id = ? WHERE target_page_id = ?').run(existing.id, row.id);
226
+ this.db.prepare('UPDATE OR IGNORE metrics SET page_id = ? WHERE page_id = ?').run(existing.id, row.id);
227
+ this.db.prepare('DELETE FROM metrics WHERE page_id = ?').run(row.id);
228
+ this.db.prepare('DELETE FROM pages WHERE id = ?').run(row.id);
229
+ }
230
+ else {
231
+ this.db.prepare('UPDATE pages SET normalized_url = ? WHERE id = ?').run(targetPath, row.id);
232
+ }
233
+ }
234
+ });
235
+ tx();
236
+ }
93
237
  }
@@ -2,6 +2,8 @@ import { Database } from 'better-sqlite3';
2
2
  export interface Site {
3
3
  id: number;
4
4
  domain: string;
5
+ preferred_url: string | null;
6
+ ssl: number | null;
5
7
  created_at: string;
6
8
  settings_json: string | null;
7
9
  is_active: number;
@@ -9,7 +11,14 @@ export interface Site {
9
11
  export declare class SiteRepository {
10
12
  private db;
11
13
  constructor(db: Database);
14
+ getSiteById(id: number): Site | undefined;
12
15
  getSite(domain: string): Site | undefined;
16
+ getAllSites(): Site[];
13
17
  createSite(domain: string): number;
18
+ updateSitePreference(id: number, prefs: {
19
+ preferred_url: string;
20
+ ssl: number;
21
+ }): void;
14
22
  firstOrCreateSite(domain: string): Site;
23
+ deleteSite(id: number): void;
15
24
  }
@@ -3,14 +3,24 @@ export class SiteRepository {
3
3
  constructor(db) {
4
4
  this.db = db;
5
5
  }
6
+ getSiteById(id) {
7
+ return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id);
8
+ }
6
9
  getSite(domain) {
7
10
  return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain);
8
11
  }
12
+ getAllSites() {
13
+ return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all();
14
+ }
9
15
  createSite(domain) {
10
16
  const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
11
17
  const info = stmt.run(domain);
12
18
  return info.lastInsertRowid;
13
19
  }
20
+ updateSitePreference(id, prefs) {
21
+ const stmt = this.db.prepare('UPDATE sites SET preferred_url = ?, ssl = ? WHERE id = ?');
22
+ stmt.run(prefs.preferred_url, prefs.ssl, id);
23
+ }
14
24
  firstOrCreateSite(domain) {
15
25
  let site = this.getSite(domain);
16
26
  if (!site) {
@@ -19,4 +29,7 @@ export class SiteRepository {
19
29
  }
20
30
  return site;
21
31
  }
32
+ deleteSite(id) {
33
+ this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
34
+ }
22
35
  }
@@ -2,11 +2,11 @@ import { Database } from 'better-sqlite3';
2
2
  export interface Snapshot {
3
3
  id: number;
4
4
  site_id: number;
5
- type: 'full' | 'partial' | 'incremental';
5
+ run_type: 'completed' | 'incremental' | 'single';
6
6
  created_at: string;
7
7
  node_count: number;
8
8
  edge_count: number;
9
- status: 'running' | 'completed' | 'failed';
9
+ status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
10
10
  limit_reached: number;
11
11
  health_score: number | null;
12
12
  orphan_count: number | null;
@@ -15,8 +15,17 @@ export interface Snapshot {
15
15
  export declare class SnapshotRepository {
16
16
  private db;
17
17
  constructor(db: Database);
18
- createSnapshot(siteId: number, type: 'full' | 'partial' | 'incremental', status?: 'running' | 'completed' | 'failed'): number;
19
- getLatestSnapshot(siteId: number, status?: 'completed' | 'running' | 'failed'): Snapshot | undefined;
20
- updateSnapshotStatus(id: number, status: 'completed' | 'failed', stats?: Partial<Snapshot>): void;
18
+ createSnapshot(siteId: number, runType: Snapshot['run_type'], status?: Snapshot['status']): number;
19
+ getLatestSnapshot(siteId: number, status?: Snapshot['status'], includeSingle?: boolean): Snapshot | undefined;
20
+ touchSnapshot(id: number): void;
21
+ getSnapshotCount(siteId: number): number;
22
+ /**
23
+ * Returns true if the site has ever had a completed full or incremental crawl.
24
+ * Single snapshots (from page --live) do NOT count as a "first crawl".
25
+ */
26
+ hasFullCrawl(siteId: number): boolean;
27
+ updateSnapshotStatus(id: number, status: Snapshot['status'], stats?: Partial<Snapshot>): void;
21
28
  getSnapshot(id: number): Snapshot | undefined;
29
+ deleteSnapshot(id: number): void;
30
+ pruneSnapshots(siteId: number, maxSnapshots: number, maxSingleSnapshots: number, protectedSnapshotId?: number): void;
22
31
  }
@@ -3,21 +3,39 @@ export class SnapshotRepository {
3
3
  constructor(db) {
4
4
  this.db = db;
5
5
  }
6
- createSnapshot(siteId, type, status = 'running') {
7
- const stmt = this.db.prepare('INSERT INTO snapshots (site_id, type, status) VALUES (?, ?, ?)');
8
- const info = stmt.run(siteId, type, status);
6
+ createSnapshot(siteId, runType, status = 'running') {
7
+ const stmt = this.db.prepare('INSERT INTO snapshots (site_id, run_type, status) VALUES (?, ?, ?)');
8
+ const info = stmt.run(siteId, runType, status);
9
9
  return info.lastInsertRowid;
10
10
  }
11
- getLatestSnapshot(siteId, status) {
11
+ getLatestSnapshot(siteId, status, includeSingle = false) {
12
12
  let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
13
+ if (!includeSingle) {
14
+ sql += ' AND run_type != \'single\'';
15
+ }
13
16
  const params = [siteId];
14
17
  if (status) {
15
18
  sql += ' AND status = ?';
16
19
  params.push(status);
17
20
  }
18
- sql += ' ORDER BY created_at DESC LIMIT 1';
21
+ sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
19
22
  return this.db.prepare(sql).get(...params);
20
23
  }
24
+ touchSnapshot(id) {
25
+ this.db.prepare(`UPDATE snapshots SET created_at = datetime('now') WHERE id = ?`).run(id);
26
+ }
27
+ getSnapshotCount(siteId) {
28
+ const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
29
+ return result.count;
30
+ }
31
+ /**
32
+ * Returns true if the site has ever had a completed full or incremental crawl.
33
+ * Single snapshots (from page --live) do NOT count as a "first crawl".
34
+ */
35
+ hasFullCrawl(siteId) {
36
+ const result = this.db.prepare(`SELECT COUNT(*) as count FROM snapshots WHERE site_id = ? AND run_type IN ('completed', 'incremental') AND status = 'completed'`).get(siteId);
37
+ return result.count > 0;
38
+ }
21
39
  updateSnapshotStatus(id, status, stats = {}) {
22
40
  const sets = ['status = ?'];
23
41
  const params = [status];
@@ -52,4 +70,45 @@ export class SnapshotRepository {
52
70
  getSnapshot(id) {
53
71
  return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id);
54
72
  }
73
+ deleteSnapshot(id) {
74
+ const tx = this.db.transaction(() => {
75
+ // Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
76
+ this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
77
+ this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
78
+ // Cleanup: Delete pages that are no longer referenced by any snapshot
79
+ this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
80
+ // Delete the snapshot
81
+ this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
82
+ });
83
+ tx();
84
+ }
85
+ pruneSnapshots(siteId, maxSnapshots, maxSingleSnapshots, protectedSnapshotId) {
86
+ const tx = this.db.transaction(() => {
87
+ const singlesToDelete = this.db.prepare(`
88
+ SELECT id
89
+ FROM snapshots
90
+ WHERE site_id = ? AND run_type = 'single'
91
+ ORDER BY created_at DESC, id DESC
92
+ LIMIT -1 OFFSET ?
93
+ `).all(siteId, Math.max(0, maxSingleSnapshots));
94
+ const fullToDelete = this.db.prepare(`
95
+ SELECT id
96
+ FROM snapshots
97
+ WHERE site_id = ? AND run_type IN ('completed', 'incremental')
98
+ ORDER BY created_at DESC, id DESC
99
+ LIMIT -1 OFFSET ?
100
+ `).all(siteId, Math.max(0, maxSnapshots));
101
+ const ids = [...singlesToDelete, ...fullToDelete]
102
+ .map(r => r.id)
103
+ .filter(id => id !== protectedSnapshotId);
104
+ for (const id of ids) {
105
+ // Inline delete logic to keep operation inside this transaction.
106
+ this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
107
+ this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
108
+ this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
109
+ this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
110
+ }
111
+ });
112
+ tx();
113
+ }
55
114
  }
@@ -0,0 +1,9 @@
1
+ export interface ResetOptions { // Options accepted by resetCrawlith; every field is optional.
2
+ reportsDir?: string; // Reports directory to delete recursively during the reset; when omitted, no reports directory is removed.
3
+ dryRun?: boolean; // When true, resetCrawlith returns immediately and changes nothing (treated as false when omitted).
4
+ }
5
+ /**
6
+ * Completely resets the Crawlith state.
7
+ * Deletes the database, clears all locks, and optionally wipes the reports directory.
+ * When `options.dryRun` is true the call is a no-op: nothing is closed, cleared, or deleted.
+ * The returned promise resolves once the cleanup has finished and the database has been re-initialized.
8
+ */
9
+ export declare function resetCrawlith(options?: ResetOptions): Promise<void>;
@@ -0,0 +1,32 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import os from 'node:os';
4
+ import { closeDb, getDb, getDbPath } from './index.js';
5
+ import { LockManager } from '../lock/lockManager.js';
6
/**
 * Completely resets the Crawlith state.
 *
 * Closes the database, clears all locks, removes the `~/.crawlith` state
 * directory and the database file itself, optionally wipes the reports
 * directory, and finally re-initializes the database.
 *
 * @param options.reportsDir Reports directory to delete recursively; skipped when omitted.
 * @param options.dryRun When true, return immediately without touching anything.
 * @returns A promise that resolves once the reset has finished.
 */
export async function resetCrawlith(options = {}) {
    const { reportsDir, dryRun = false } = options;
    if (dryRun) {
        return;
    }
    // 1. Close database connection to release file handles
    closeDb();
    // 2. Clear all locks
    await LockManager.clearAllLocks();
    // 3. Remove the entire state directory and the database file
    const dbPath = getDbPath();
    if (dbPath !== ':memory:') {
        const crawlithDir = path.join(os.homedir(), '.crawlith');
        await fs.rm(crawlithDir, { recursive: true, force: true });
        if (dbPath) {
            // getDbPath() is not guaranteed to resolve inside ~/.crawlith. Without this,
            // a database stored elsewhere would survive the reset and be reopened by
            // getDb() below. `force` makes this a no-op when the files are already gone.
            // The -wal/-shm sidecars are the files SQLite keeps next to a WAL-mode database.
            const dbFiles = [dbPath, `${dbPath}-wal`, `${dbPath}-shm`];
            await Promise.all(dbFiles.map(file => fs.rm(file, { force: true })));
        }
    }
    // 4. Remove reports directory if specified
    if (reportsDir) {
        const resolvedReportsDir = path.resolve(reportsDir);
        await fs.rm(resolvedReportsDir, { recursive: true, force: true });
    }
    // 5. Re-initialize database to ensure schema is fresh for next use
    getDb();
}
@@ -0,0 +1,12 @@
1
+ import { Database, Statement } from 'better-sqlite3';
2
+ export declare class Statements { // Bundle of SQL statements, each prepared once in the constructor against the given database.
3
+ private db;
4
+ getPageIdByUrl: Statement; // Params (snapshot id, normalized URL): selects the page id within the site that owns that snapshot.
5
+ insertPluginReport: Statement; // INSERT OR REPLACE into plugin_reports; params: snapshot_id, plugin_name, data, total_score, score_count, score_weight_sum, score_calculated_at.
6
+ getPluginReport: Statement; // Params (snapshot id, plugin name): selects `data` of the newest matching plugin_reports row by created_at.
7
+ deleteSnapshotPlugins: Statement; // Param (snapshot id): deletes every plugin_reports row for that snapshot.
8
+ getSnapshot: Statement; // Param (snapshot id): selects the id of the matching snapshots row; presumably used as an existence check.
9
+ getMigration: Statement; // Param (plugin name): selects the matching plugin_migrations row, if any.
10
+ insertMigration: Statement; // Param (plugin name): inserts a plugin_migrations row for that plugin.
11
+ constructor(db: Database);
12
+ }
@@ -0,0 +1,40 @@
1
/**
 * Holds the SQL statements that are prepared once, up front, against a
 * single database handle so callers can reuse them.
 */
export class Statements {
    db;
    getPageIdByUrl;
    insertPluginReport;
    getPluginReport;
    deleteSnapshotPlugins;
    getSnapshot;
    getMigration;
    insertMigration;
    /**
     * Prepares every statement immediately, in declaration order.
     * @param db Database handle exposing `prepare(sql)`.
     */
    constructor(db) {
        this.db = db;
        const handle = this.db;
        // Page id lookup, scoped to the site that owns the given snapshot.
        const pageIdByUrlSql = `
            SELECT id FROM pages
            WHERE site_id = (SELECT site_id FROM snapshots WHERE id = ?)
            AND normalized_url = ?
        `;
        this.getPageIdByUrl = handle.prepare(pageIdByUrlSql);
        // Upsert of one plugin report row.
        const upsertReportSql = `
            INSERT OR REPLACE INTO plugin_reports
            (snapshot_id, plugin_name, data, total_score, score_count, score_weight_sum, score_calculated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        `;
        this.insertPluginReport = handle.prepare(upsertReportSql);
        // Most recent report payload for a (snapshot, plugin) pair.
        const latestReportSql = `
            SELECT data FROM plugin_reports
            WHERE snapshot_id = ? AND plugin_name = ?
            ORDER BY created_at DESC LIMIT 1
        `;
        this.getPluginReport = handle.prepare(latestReportSql);
        // Removal of every plugin report attached to a snapshot.
        const purgeReportsSql = `
            DELETE FROM plugin_reports WHERE snapshot_id = ?
        `;
        this.deleteSnapshotPlugins = handle.prepare(purgeReportsSql);
        // Snapshot lookup by id.
        const snapshotByIdSql = `
            SELECT id FROM snapshots WHERE id = ?
        `;
        this.getSnapshot = handle.prepare(snapshotByIdSql);
        // Plugin migration bookkeeping: lookup and insert.
        const migrationByNameSql = `
            SELECT plugin_name FROM plugin_migrations WHERE plugin_name = ?
        `;
        this.getMigration = handle.prepare(migrationByNameSql);
        const recordMigrationSql = `
            INSERT INTO plugin_migrations (plugin_name) VALUES (?)
        `;
        this.insertMigration = handle.prepare(recordMigrationSql);
    }
}
@@ -12,11 +12,6 @@ export interface DiffResult {
12
12
  oldCanonical: string | null;
13
13
  newCanonical: string | null;
14
14
  }[];
15
- changedDuplicateGroup: {
16
- url: string;
17
- oldGroup: string | null;
18
- newGroup: string | null;
19
- }[];
20
15
  metricDeltas: {
21
16
  structuralEntropy: number;
22
17
  orphanCount: number;
@@ -6,7 +6,6 @@ export function compareGraphs(oldGraph, newGraph) {
6
6
  const removedUrls = [];
7
7
  const changedStatus = [];
8
8
  const changedCanonical = [];
9
- const changedDuplicateGroup = [];
10
9
  // Added & Changed
11
10
  for (const [url, newNode] of newNodes) {
12
11
  const oldNode = oldNodes.get(url);
@@ -26,16 +25,6 @@ export function compareGraphs(oldGraph, newGraph) {
26
25
  newCanonical: newNode.canonical || null
27
26
  });
28
27
  }
29
- // Changed Duplicate Group
30
- const oldGroup = oldNode.duplicateClusterId || null;
31
- const newGroup = newNode.duplicateClusterId || null;
32
- if (oldGroup !== newGroup) {
33
- changedDuplicateGroup.push({
34
- url,
35
- oldGroup,
36
- newGroup
37
- });
38
- }
39
28
  }
40
29
  }
41
30
  // Removed
@@ -58,7 +47,6 @@ export function compareGraphs(oldGraph, newGraph) {
58
47
  removedUrls,
59
48
  changedStatus,
60
49
  changedCanonical,
61
- changedDuplicateGroup,
62
50
  metricDeltas
63
51
  };
64
52
  }
@@ -0,0 +1,16 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ export interface DiffOptions { // Options bag for DiffService.compare.
3
+ onlyCritical?: boolean; // NOTE(review): accepted by the signature, but the current compare implementation never reads it.
4
+ }
5
+ export interface SnapshotDiff { // Result of comparing the node URLs and per-page fields of two graphs.
6
+ newPages: string[]; // URLs present in the new graph but not in the old one (all URLs when there is no old graph).
7
+ removedPages: string[]; // URLs present in the old graph but missing from the new one.
8
+ changedPages: { // One entry per URL present in both graphs with at least one detected change.
9
+ url: string;
10
+ changes: string[]; // Human-readable descriptions, e.g. "status: 200 -> 404", "content changed", "noindex: false -> true".
11
+ severity: 'low' | 'medium' | 'high'; // 'high' for a status or noindex change, 'medium' for a content-only change; compare never emits 'low' for a recorded change.
12
+ }[];
13
+ }
14
+ export declare class DiffService { // Computes page-level differences between two crawl graphs.
15
+ compare(oldGraph: Graph | undefined, newGraph: Graph, _options?: DiffOptions): SnapshotDiff; // With no oldGraph, every URL of newGraph is reported as new and nothing as removed or changed.
16
+ }
@@ -0,0 +1,41 @@
1
/**
 * Computes page-level differences between two crawl graphs.
 */
export class DiffService {
    /**
     * Compares the node URLs and the status / contentHash / noindex fields of two graphs.
     *
     * @param oldGraph Previous graph; when absent, every URL of `newGraph` is reported as new.
     * @param newGraph Current graph.
     * @param _options Currently unused.
     * @returns New URLs, removed URLs, and one change record per URL whose fields differ.
     */
    compare(oldGraph, newGraph, _options = {}) {
        if (!oldGraph) {
            return {
                newPages: [...newGraph.nodes.keys()],
                removedPages: [],
                changedPages: []
            };
        }
        const previousUrls = new Set(oldGraph.nodes.keys());
        const currentUrls = new Set(newGraph.nodes.keys());
        const newPages = [];
        const changedPages = [];
        for (const url of currentUrls) {
            if (!previousUrls.has(url)) {
                newPages.push(url);
                continue;
            }
            const before = oldGraph.nodes.get(url);
            const after = newGraph.nodes.get(url);
            const statusChanged = before.status !== after.status;
            const contentChanged = before.contentHash !== after.contentHash;
            const noindexChanged = before.noindex !== after.noindex;
            if (!statusChanged && !contentChanged && !noindexChanged) {
                continue;
            }
            // Descriptions are emitted in a fixed order: status, content, noindex.
            const changes = [];
            if (statusChanged) {
                changes.push(`status: ${before.status} -> ${after.status}`);
            }
            if (contentChanged) {
                changes.push('content changed');
            }
            if (noindexChanged) {
                changes.push(`noindex: ${before.noindex} -> ${after.noindex}`);
            }
            // Status or noindex changes are high severity; a content-only change is medium.
            const severity = statusChanged || noindexChanged ? 'high' : 'medium';
            changedPages.push({ url, changes, severity });
        }
        const removedPages = [];
        for (const url of previousUrls) {
            if (!currentUrls.has(url)) {
                removedPages.push(url);
            }
        }
        return { newPages, removedPages, changedPages };
    }
}
@@ -0,0 +1,4 @@
1
+ export * from '../graph/graph.js';
2
+ export * from '../graph/metrics.js';
3
+ export * from '../graph/simhash.js';
4
+ export * from '../crawler/normalize.js';
@@ -0,0 +1,4 @@
1
+ export * from '../graph/graph.js';
2
+ export * from '../graph/metrics.js';
3
+ export * from '../graph/simhash.js';
4
+ export * from '../crawler/normalize.js';