ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,2910 @@
1
+ /**
2
+ * Database Migration Operations
3
+ *
4
+ * Contains the main migration functions: initializeDatabase, migrateToLatest,
5
+ * checkSchemaVersion, and getCurrentSchemaVersion.
6
+ *
7
+ * @module migrations/operations
8
+ */
9
+ import { MigrationError } from './types.js';
10
+ import { SCHEMA_VERSION, CREATE_CHUNKS_FTS_TABLE, CREATE_FTS_TRIGGERS, CREATE_FTS_INDEX_METADATA, CREATE_VLM_FTS_TABLE, CREATE_VLM_FTS_TRIGGERS, CREATE_EXTRACTIONS_TABLE, CREATE_FORM_FILLS_TABLE, CREATE_EXTRACTIONS_FTS_TABLE, CREATE_EXTRACTIONS_FTS_TRIGGERS, CREATE_UPLOADED_FILES_TABLE, CREATE_COMPARISONS_TABLE, CREATE_CLUSTERS_TABLE, CREATE_DOCUMENT_CLUSTERS_TABLE, CREATE_TAGS_TABLE, CREATE_ENTITY_TAGS_TABLE, CREATE_DOCUMENTS_FTS_TABLE, CREATE_DOCUMENTS_FTS_TRIGGERS, CREATE_USERS_TABLE, CREATE_AUDIT_LOG_TABLE, CREATE_ANNOTATIONS_TABLE, CREATE_DOCUMENT_LOCKS_TABLE, CREATE_WORKFLOW_STATES_TABLE, CREATE_APPROVAL_CHAINS_TABLE, CREATE_APPROVAL_STEPS_TABLE, CREATE_OBLIGATIONS_TABLE, CREATE_PLAYBOOKS_TABLE, CREATE_WEBHOOKS_TABLE, } from './schema-definitions.js';
// ─── Legacy entity/KG table definitions (inlined for migration chain v12→v25) ───
// These tables were removed from schema-definitions.ts in v26 but the migration
// functions that originally created them (v12→v13, v14→v15, v17→v18, etc.) still
// reference these constants so that old databases can migrate through the full chain.
// The v25→v26 migration then drops all of them.
//
// Every statement below is written with IF NOT EXISTS, so executing it against a
// database that already contains the object does not fail.

// Legacy `entities` table: typed entity text (raw + normalized) with a confidence
// score, referencing documents(id) and provenance(id).
const CREATE_ENTITIES_TABLE = `
CREATE TABLE IF NOT EXISTS entities (
id TEXT PRIMARY KEY NOT NULL,
document_id TEXT NOT NULL REFERENCES documents(id),
entity_type TEXT NOT NULL CHECK (entity_type IN ('person', 'organization', 'date', 'amount', 'case_number', 'location', 'statute', 'exhibit', 'medication', 'diagnosis', 'medical_device', 'other')),
raw_text TEXT NOT NULL,
normalized_text TEXT NOT NULL,
confidence REAL NOT NULL DEFAULT 0.0,
metadata TEXT,
provenance_id TEXT NOT NULL REFERENCES provenance(id),
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)`;
// Legacy `entity_mentions` table: positional occurrences of an entity, referencing
// entities(id), documents(id) and (optionally) chunks(id).
const CREATE_ENTITY_MENTIONS_TABLE = `
CREATE TABLE IF NOT EXISTS entity_mentions (
id TEXT PRIMARY KEY NOT NULL,
entity_id TEXT NOT NULL REFERENCES entities(id),
document_id TEXT NOT NULL REFERENCES documents(id),
chunk_id TEXT REFERENCES chunks(id),
page_number INTEGER,
character_start INTEGER,
character_end INTEGER,
context_text TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)`;
// Legacy `knowledge_nodes` table: graph nodes with the same entity_type CHECK list
// as `entities`, plus aggregate counters and a provenance(id) foreign key.
const CREATE_KNOWLEDGE_NODES_TABLE = `
CREATE TABLE IF NOT EXISTS knowledge_nodes (
id TEXT PRIMARY KEY,
entity_type TEXT NOT NULL CHECK (entity_type IN ('person', 'organization', 'date', 'amount', 'case_number', 'location', 'statute', 'exhibit', 'medication', 'diagnosis', 'medical_device', 'other')),
canonical_name TEXT NOT NULL,
normalized_name TEXT NOT NULL,
aliases TEXT,
document_count INTEGER NOT NULL DEFAULT 1,
mention_count INTEGER NOT NULL DEFAULT 0,
edge_count INTEGER NOT NULL DEFAULT 0,
avg_confidence REAL NOT NULL DEFAULT 0.0,
metadata TEXT,
provenance_id TEXT NOT NULL,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
importance_score REAL,
resolution_type TEXT,
FOREIGN KEY (provenance_id) REFERENCES provenance(id)
)
`;
// Legacy `knowledge_edges` table: typed relationships between two knowledge_nodes
// rows; relationship_type is restricted by a CHECK list.
const CREATE_KNOWLEDGE_EDGES_TABLE = `
CREATE TABLE IF NOT EXISTS knowledge_edges (
id TEXT PRIMARY KEY,
source_node_id TEXT NOT NULL,
target_node_id TEXT NOT NULL,
relationship_type TEXT NOT NULL CHECK (relationship_type IN ('co_mentioned', 'co_located', 'works_at', 'represents', 'located_in', 'filed_in', 'cites', 'references', 'party_to', 'related_to', 'precedes', 'occurred_at', 'treated_with', 'administered_via', 'managed_by', 'interacts_with', 'diagnosed_with', 'prescribed_by', 'admitted_to', 'supervised_by', 'filed_by', 'contraindicated_with')),
weight REAL NOT NULL DEFAULT 1.0,
evidence_count INTEGER NOT NULL DEFAULT 1,
document_ids TEXT NOT NULL,
metadata TEXT,
provenance_id TEXT NOT NULL,
created_at TEXT NOT NULL,
valid_from TEXT,
valid_until TEXT,
normalized_weight REAL DEFAULT 0,
contradiction_count INTEGER DEFAULT 0,
FOREIGN KEY (source_node_id) REFERENCES knowledge_nodes(id),
FOREIGN KEY (target_node_id) REFERENCES knowledge_nodes(id),
FOREIGN KEY (provenance_id) REFERENCES provenance(id)
)
`;
// Legacy `node_entity_links` table: links a knowledge node to an entity; entity_id
// is UNIQUE, so each entity can be linked at most once.
const CREATE_NODE_ENTITY_LINKS_TABLE = `
CREATE TABLE IF NOT EXISTS node_entity_links (
id TEXT PRIMARY KEY,
node_id TEXT NOT NULL,
entity_id TEXT NOT NULL UNIQUE,
document_id TEXT NOT NULL,
similarity_score REAL NOT NULL DEFAULT 1.0,
resolution_method TEXT,
created_at TEXT NOT NULL,
FOREIGN KEY (node_id) REFERENCES knowledge_nodes(id),
FOREIGN KEY (entity_id) REFERENCES entities(id),
FOREIGN KEY (document_id) REFERENCES documents(id)
)
`;
// Legacy FTS5 index over knowledge_nodes.canonical_name. Declared as an
// external-content table (content='knowledge_nodes'), so it must be kept in sync
// by the triggers defined below.
const CREATE_KNOWLEDGE_NODES_FTS_TABLE = `
CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_nodes_fts USING fts5(
canonical_name,
content='knowledge_nodes',
content_rowid='rowid',
tokenize='porter unicode61'
)
`;
// Sync triggers for knowledge_nodes_fts: after insert (ai), after delete (ad) and
// after update of canonical_name (au). Deletes use the FTS5 'delete' command row.
const CREATE_KNOWLEDGE_NODES_FTS_TRIGGERS = [
    `CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_ai AFTER INSERT ON knowledge_nodes BEGIN
INSERT INTO knowledge_nodes_fts(rowid, canonical_name) VALUES (new.rowid, new.canonical_name);
END`,
    `CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_ad AFTER DELETE ON knowledge_nodes BEGIN
INSERT INTO knowledge_nodes_fts(knowledge_nodes_fts, rowid, canonical_name) VALUES ('delete', old.rowid, old.canonical_name);
END`,
    `CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_au AFTER UPDATE OF canonical_name ON knowledge_nodes BEGIN
INSERT INTO knowledge_nodes_fts(knowledge_nodes_fts, rowid, canonical_name) VALUES ('delete', old.rowid, old.canonical_name);
INSERT INTO knowledge_nodes_fts(rowid, canonical_name) VALUES (new.rowid, new.canonical_name);
END`,
];
// Legacy `entity_extraction_segments` table: per-document text segments with an
// extraction_status CHECK list; (document_id, segment_index) is UNIQUE.
const CREATE_ENTITY_EXTRACTION_SEGMENTS_TABLE = `
CREATE TABLE IF NOT EXISTS entity_extraction_segments (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL REFERENCES documents(id),
ocr_result_id TEXT NOT NULL REFERENCES ocr_results(id),
segment_index INTEGER NOT NULL,
text TEXT NOT NULL,
character_start INTEGER NOT NULL,
character_end INTEGER NOT NULL,
text_length INTEGER NOT NULL,
overlap_previous INTEGER NOT NULL DEFAULT 0,
overlap_next INTEGER NOT NULL DEFAULT 0,
extraction_status TEXT NOT NULL DEFAULT 'pending'
CHECK (extraction_status IN ('pending', 'processing', 'complete', 'failed')),
entity_count INTEGER DEFAULT 0,
extracted_at TEXT,
error_message TEXT,
provenance_id TEXT REFERENCES provenance(id),
created_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(document_id, segment_index)
)
`;
// Legacy `entity_embeddings` table: embedding bookkeeping rows that reference
// knowledge_nodes(id); model_name defaults to 'nomic-embed-text-v1.5'.
const CREATE_ENTITY_EMBEDDINGS_TABLE = `
CREATE TABLE IF NOT EXISTS entity_embeddings (
id TEXT PRIMARY KEY,
node_id TEXT NOT NULL REFERENCES knowledge_nodes(id),
original_text TEXT NOT NULL,
original_text_length INTEGER NOT NULL,
entity_type TEXT NOT NULL,
document_count INTEGER NOT NULL DEFAULT 1,
model_name TEXT NOT NULL DEFAULT 'nomic-embed-text-v1.5',
content_hash TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
provenance_id TEXT REFERENCES provenance(id)
)
`;
// Legacy vec0 virtual table holding 768-dimension vectors with cosine distance,
// keyed by entity_embedding_id. Requires the sqlite-vec extension to be loaded.
const CREATE_VEC_ENTITY_EMBEDDINGS_TABLE = `
CREATE VIRTUAL TABLE IF NOT EXISTS vec_entity_embeddings USING vec0(
entity_embedding_id TEXT PRIMARY KEY,
vector FLOAT[768] distance_metric=cosine
)
`;
// Legacy `corpus_intelligence` table: corpus-level summary text, actors, themes and
// counts, with a required provenance(id) reference.
const CREATE_CORPUS_INTELLIGENCE_TABLE = `
CREATE TABLE IF NOT EXISTS corpus_intelligence (
id TEXT PRIMARY KEY,
database_name TEXT NOT NULL,
corpus_summary TEXT NOT NULL,
key_actors TEXT NOT NULL,
themes TEXT NOT NULL,
narrative_arcs TEXT,
entity_count INTEGER NOT NULL,
document_count INTEGER NOT NULL,
model TEXT NOT NULL,
provenance_id TEXT NOT NULL REFERENCES provenance(id),
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`;
// Legacy `document_narratives` table: at most one narrative row per document
// (document_id is UNIQUE).
const CREATE_DOCUMENT_NARRATIVES_TABLE = `
CREATE TABLE IF NOT EXISTS document_narratives (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL UNIQUE REFERENCES documents(id),
narrative_text TEXT NOT NULL,
entity_roster TEXT NOT NULL,
corpus_context TEXT,
synthesis_count INTEGER DEFAULT 0,
model TEXT NOT NULL,
provenance_id TEXT NOT NULL REFERENCES provenance(id),
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`;
// Legacy `entity_roles` table: role/theme annotations for a knowledge node; scope
// defaults to 'database'.
const CREATE_ENTITY_ROLES_TABLE = `
CREATE TABLE IF NOT EXISTS entity_roles (
id TEXT PRIMARY KEY,
node_id TEXT NOT NULL REFERENCES knowledge_nodes(id),
role TEXT NOT NULL,
theme TEXT,
importance_rank INTEGER,
context_summary TEXT,
scope TEXT NOT NULL DEFAULT 'database',
scope_id TEXT,
model TEXT NOT NULL,
provenance_id TEXT NOT NULL REFERENCES provenance(id),
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)`;
199
+ import { configurePragmas, initializeSchemaVersion, createTables, createVecTable, createIndexes, createFTSTables, initializeDatabaseMetadata, loadSqliteVecExtension, } from './schema-helpers.js';
200
+ import { computeFTSContentHash } from '../../search/bm25.js';
/**
 * Read the schema version recorded in the database.
 *
 * Looks up the `schema_version` table in `sqlite_master`; when the table is
 * missing, or its `id = 1` row is missing or has no version, the result is 0.
 *
 * @param db - Database instance
 * @returns Recorded schema version, or 0 if not initialized
 * @throws MigrationError if either query fails
 */
export function checkSchemaVersion(db) {
    try {
        const tableLookup = db.prepare(`
SELECT name FROM sqlite_master
WHERE type = 'table' AND name = 'schema_version'
`);
        const versionTable = tableLookup.get();
        if (versionTable) {
            const versionLookup = db.prepare('SELECT version FROM schema_version WHERE id = ?');
            const versionRow = versionLookup.get(1);
            return versionRow?.version ?? 0;
        }
        // No schema_version table: the database has never been initialized.
        return 0;
    }
    catch (error) {
        throw new MigrationError('Failed to check schema version', 'query', 'schema_version', error);
    }
}
/**
 * Report the schema version this build of the code targets.
 *
 * @returns The SCHEMA_VERSION constant imported from schema-definitions
 */
export function getCurrentSchemaVersion() {
    const targetVersion = SCHEMA_VERSION;
    return targetVersion;
}
/**
 * Set up a database: pragmas, sqlite-vec, tables, indexes, FTS and version stamp.
 *
 * Pragmas and the sqlite-vec extension are applied first, outside any
 * transaction. All schema objects are then created inside a single transaction,
 * with the schema version written as the final step, so an interrupted run
 * leaves no version stamp behind (MIG-5 fix) and the next start re-initializes.
 *
 * The original documentation describes this as idempotent (tables are only
 * created when absent).
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if any operation fails
 */
export function initializeDatabase(db) {
    // Must run outside a transaction.
    configurePragmas(db);
    // Must be loaded before any virtual table is created, outside a transaction.
    loadSqliteVecExtension(db);
    const createSchema = db.transaction(() => {
        // Dependency order matters: tables, vec table, indexes, FTS, metadata,
        // and finally the version stamp (LAST, so a crash before it means version=0).
        const orderedSteps = [
            createTables,
            createVecTable,
            createIndexes,
            createFTSTables,
            initializeDatabaseMetadata,
            initializeSchemaVersion,
        ];
        for (const runStep of orderedSteps) {
            runStep(db);
        }
    });
    createSchema();
}
/**
 * Migrate from schema version 1 to version 2
 *
 * Changes in v2:
 * - provenance.type: Added 'IMAGE' and 'VLM_DESCRIPTION' to CHECK constraint
 * - provenance.source_type: Added 'IMAGE_EXTRACTION' and 'VLM' to CHECK constraint
 * - images: New table and its indexes
 *
 * SQLite cannot modify a CHECK constraint through ALTER TABLE, so the provenance
 * table is rebuilt: create provenance_new with the new constraints, copy all
 * rows, drop the old table, rename the new one, then recreate the indexes.
 *
 * Foreign key enforcement is switched off for the rebuild (other tables
 * reference provenance) and is always switched back on before this function
 * returns or throws, even when the rollback itself fails.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV1ToV2(db) {
    // True only while the transaction opened by THIS function is active, so a
    // failed BEGIN never triggers a ROLLBACK of something we did not start.
    let transactionOpen = false;
    let failure = null;
    try {
        // PRAGMA foreign_keys has no effect inside a transaction, so it is
        // toggled before BEGIN and restored after COMMIT/ROLLBACK.
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        transactionOpen = true;
        // Step 1: Create a new table with updated CHECK constraints
        db.exec(`
CREATE TABLE provenance_new (
id TEXT PRIMARY KEY,
type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING')),
created_at TEXT NOT NULL,
processed_at TEXT NOT NULL,
source_file_created_at TEXT,
source_file_modified_at TEXT,
source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'EMBEDDING')),
source_path TEXT,
source_id TEXT,
root_document_id TEXT NOT NULL,
location TEXT,
content_hash TEXT NOT NULL,
input_hash TEXT,
file_hash TEXT,
processor TEXT NOT NULL,
processor_version TEXT NOT NULL,
processing_params TEXT NOT NULL,
processing_duration_ms INTEGER,
processing_quality_score REAL,
parent_id TEXT,
parent_ids TEXT NOT NULL,
chain_depth INTEGER NOT NULL,
chain_path TEXT,
FOREIGN KEY (source_id) REFERENCES provenance_new(id),
FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
)
`);
        // Step 2: Copy existing data to the new table.
        // NOTE(review): SELECT * relies on the v1 table having the same column
        // order as provenance_new — confirm against the v1 schema.
        db.exec(`
INSERT INTO provenance_new
SELECT * FROM provenance
`);
        // Step 3: Drop the old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename the new table to the original name
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate indexes for the provenance table
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // Step 6: Create images table (new in v2 - supports IMAGE provenance type)
        db.exec(`
CREATE TABLE IF NOT EXISTS images (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL,
ocr_result_id TEXT NOT NULL,
page_number INTEGER NOT NULL,
bbox_x REAL NOT NULL,
bbox_y REAL NOT NULL,
bbox_width REAL NOT NULL,
bbox_height REAL NOT NULL,
image_index INTEGER NOT NULL,
format TEXT NOT NULL,
width INTEGER NOT NULL,
height INTEGER NOT NULL,
extracted_path TEXT,
file_size INTEGER,
vlm_status TEXT NOT NULL DEFAULT 'pending' CHECK (vlm_status IN ('pending', 'processing', 'complete', 'failed')),
vlm_description TEXT,
vlm_structured_data TEXT,
vlm_embedding_id TEXT,
vlm_model TEXT,
vlm_confidence REAL,
vlm_processed_at TEXT,
vlm_tokens_used INTEGER,
context_text TEXT,
provenance_id TEXT,
created_at TEXT NOT NULL,
error_message TEXT,
FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE,
FOREIGN KEY (ocr_result_id) REFERENCES ocr_results(id) ON DELETE CASCADE,
FOREIGN KEY (vlm_embedding_id) REFERENCES embeddings(id),
FOREIGN KEY (provenance_id) REFERENCES provenance(id)
)
`);
        db.exec('CREATE INDEX IF NOT EXISTS idx_images_document_id ON images(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_images_ocr_result_id ON images(ocr_result_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_images_page ON images(document_id, page_number)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_images_vlm_status ON images(vlm_status)');
        db.exec(`CREATE INDEX IF NOT EXISTS idx_images_pending ON images(vlm_status) WHERE vlm_status = 'pending'`);
        db.exec('CREATE INDEX IF NOT EXISTS idx_images_provenance_id ON images(provenance_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v1->v2 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        transactionOpen = false;
    }
    catch (error) {
        if (transactionOpen) {
            // Rollback is isolated in its own try so that a rollback failure
            // cannot prevent foreign keys from being re-enabled below.
            try {
                db.exec('ROLLBACK');
            }
            catch (rollbackErr) {
                console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
        }
        const cause = error instanceof Error ? error.message : String(error);
        failure = new MigrationError(`Failed to migrate provenance table from v1 to v2: ${cause}`, 'migrate', 'provenance', error);
    }
    // Always restore FK enforcement, on success and on every failure path.
    try {
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (pragmaErr) {
        const cause = pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr);
        if (failure === null) {
            failure = new MigrationError(`Failed to migrate provenance table from v1 to v2: ${cause}`, 'migrate', 'provenance', pragmaErr);
        }
        else {
            // Keep the original migration error; just report the secondary failure.
            console.error('[migrations] Failed to re-enable foreign keys:', cause);
        }
    }
    if (failure !== null) {
        throw failure;
    }
}
/**
 * Migrate from schema version 2 to version 3
 *
 * Changes in v3:
 * - embeddings.chunk_id: Changed from NOT NULL to nullable
 * - embeddings.image_id: New column (nullable) for VLM description embeddings
 * - embeddings: Added CHECK constraint (chunk_id IS NOT NULL OR image_id IS NOT NULL)
 * - embeddings: Added FOREIGN KEY (image_id) REFERENCES images(id)
 *
 * This migration allows embeddings to reference either chunks (text embeddings)
 * or images (VLM description embeddings). The table is rebuilt (create
 * embeddings_new, copy, drop, rename) and its indexes are recreated.
 *
 * Foreign key enforcement is switched off for the rebuild and is always
 * switched back on before this function returns or throws, even when the
 * rollback itself fails.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV2ToV3(db) {
    // True only while the transaction opened by THIS function is active, so a
    // failed BEGIN never triggers a ROLLBACK of something we did not start.
    let transactionOpen = false;
    let failure = null;
    try {
        // Foreign keys must be disabled during table recreation. The pragma has
        // no effect inside a transaction, so it is toggled before BEGIN.
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        transactionOpen = true;
        // Step 1: Create new embeddings table with updated schema
        db.exec(`
CREATE TABLE embeddings_new (
id TEXT PRIMARY KEY,
chunk_id TEXT,
image_id TEXT,
document_id TEXT NOT NULL,
original_text TEXT NOT NULL,
original_text_length INTEGER NOT NULL,
source_file_path TEXT NOT NULL,
source_file_name TEXT NOT NULL,
source_file_hash TEXT NOT NULL,
page_number INTEGER,
page_range TEXT,
character_start INTEGER NOT NULL,
character_end INTEGER NOT NULL,
chunk_index INTEGER NOT NULL,
total_chunks INTEGER NOT NULL,
model_name TEXT NOT NULL,
model_version TEXT NOT NULL,
task_type TEXT NOT NULL CHECK (task_type IN ('search_document', 'search_query')),
inference_mode TEXT NOT NULL CHECK (inference_mode = 'local'),
gpu_device TEXT,
provenance_id TEXT NOT NULL UNIQUE,
content_hash TEXT NOT NULL,
created_at TEXT NOT NULL,
generation_duration_ms INTEGER,
FOREIGN KEY (chunk_id) REFERENCES chunks(id),
FOREIGN KEY (image_id) REFERENCES images(id),
FOREIGN KEY (document_id) REFERENCES documents(id),
FOREIGN KEY (provenance_id) REFERENCES provenance(id),
CHECK (chunk_id IS NOT NULL OR image_id IS NOT NULL)
)
`);
        // Step 2: Copy existing data (image_id will be NULL for existing embeddings)
        db.exec(`
INSERT INTO embeddings_new (
id, chunk_id, image_id, document_id, original_text, original_text_length,
source_file_path, source_file_name, source_file_hash, page_number, page_range,
character_start, character_end, chunk_index, total_chunks, model_name,
model_version, task_type, inference_mode, gpu_device, provenance_id,
content_hash, created_at, generation_duration_ms
)
SELECT
id, chunk_id, NULL, document_id, original_text, original_text_length,
source_file_path, source_file_name, source_file_hash, page_number, page_range,
character_start, character_end, chunk_index, total_chunks, model_name,
model_version, task_type, inference_mode, gpu_device, provenance_id,
content_hash, created_at, generation_duration_ms
FROM embeddings
`);
        // Step 3: Drop old table
        db.exec('DROP TABLE embeddings');
        // Step 4: Rename new table
        db.exec('ALTER TABLE embeddings_new RENAME TO embeddings');
        // Step 5: Recreate indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_chunk_id ON embeddings(chunk_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_image_id ON embeddings(image_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_document_id ON embeddings(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_source_file ON embeddings(source_file_path)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_page ON embeddings(page_number)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v2->v3 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        transactionOpen = false;
    }
    catch (error) {
        if (transactionOpen) {
            // Rollback is isolated in its own try so that a rollback failure
            // cannot prevent foreign keys from being re-enabled below.
            try {
                db.exec('ROLLBACK');
            }
            catch (rollbackErr) {
                console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
        }
        const cause = error instanceof Error ? error.message : String(error);
        failure = new MigrationError(`Failed to migrate embeddings table from v2 to v3: ${cause}`, 'migrate', 'embeddings', error);
    }
    // Always restore FK enforcement, on success and on every failure path.
    try {
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (pragmaErr) {
        const cause = pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr);
        if (failure === null) {
            failure = new MigrationError(`Failed to migrate embeddings table from v2 to v3: ${cause}`, 'migrate', 'embeddings', pragmaErr);
        }
        else {
            // Keep the original migration error; just report the secondary failure.
            console.error('[migrations] Failed to re-enable foreign keys:', cause);
        }
    }
    if (failure !== null) {
        throw failure;
    }
}
/**
 * Migrate from schema version 3 to version 4
 *
 * Adds BM25 full-text search over chunks:
 * - chunks_fts: FTS5 virtual table
 * - chunks_fts_ai/ad/au: triggers that keep chunks_fts in sync with chunks
 * - fts_index_metadata: audit row (id = 1) describing the last index rebuild
 *
 * Everything runs inside one explicit transaction; on any failure the
 * transaction is rolled back and the error is wrapped in a MigrationError.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV3ToV4(db) {
    try {
        db.exec('BEGIN TRANSACTION');
        // Virtual table first, then its sync triggers, then the metadata table.
        db.exec(CREATE_CHUNKS_FTS_TABLE);
        CREATE_FTS_TRIGGERS.forEach((triggerSql) => {
            db.exec(triggerSql);
        });
        db.exec(CREATE_FTS_INDEX_METADATA);
        // Index the chunks that already exist via the FTS5 'rebuild' command.
        db.exec("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')");
        // Record how many chunks were indexed and a hash of the indexed content.
        const { cnt: indexedChunks } = db.prepare('SELECT COUNT(*) as cnt FROM chunks').get();
        const indexHash = computeFTSContentHash(db);
        const upsertMetadata = db.prepare(`
INSERT OR REPLACE INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
VALUES (1, ?, ?, 'porter unicode61', 4, ?)
`);
        upsertMetadata.run(new Date().toISOString(), indexedChunks, indexHash);
        db.exec('COMMIT');
    }
    catch (error) {
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        const reason = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v3 to v4 (FTS5 setup): ${reason}`, 'migrate', 'chunks_fts', error);
    }
}
/**
 * Migrate from schema version 4 to version 5
 *
 * Changes in v5:
 * - images.block_type: Datalab block type (Figure, Picture, PageHeader, etc.)
 * - images.is_header_footer: Boolean flag for header/footer images
 * - images.content_hash: SHA-256 of image bytes for deduplication
 * - idx_images_content_hash: Index for fast dedup lookups
 *
 * Columns that already exist are skipped, so the migration can be re-run
 * after a partial failure. Foreign key enforcement is switched off for the
 * duration and is always switched back on before this function returns or
 * throws; every failure, including one during setup, is reported as a
 * MigrationError.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV4ToV5(db) {
    let failure = null;
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        // Check existing columns for idempotency (safe on retry after partial failure)
        const columns = db.prepare('PRAGMA table_info(images)').all();
        const columnNames = new Set(columns.map((c) => c.name));
        // Column name -> DDL that adds it; applied only when the column is missing.
        const columnAdditions = [
            ['block_type', 'ALTER TABLE images ADD COLUMN block_type TEXT'],
            ['is_header_footer', 'ALTER TABLE images ADD COLUMN is_header_footer INTEGER NOT NULL DEFAULT 0'],
            ['content_hash', 'ALTER TABLE images ADD COLUMN content_hash TEXT'],
        ];
        const transaction = db.transaction(() => {
            for (const [columnName, ddl] of columnAdditions) {
                if (!columnNames.has(columnName)) {
                    db.exec(ddl);
                }
            }
            db.exec('CREATE INDEX IF NOT EXISTS idx_images_content_hash ON images(content_hash)');
            // M-5: FK integrity check inside transaction so violations cause rollback
            const fkViolations = db.pragma('foreign_key_check');
            if (fkViolations.length > 0) {
                throw new Error(`Foreign key integrity check failed after v4->v5 migration: ${fkViolations.length} violation(s). ` +
                    `First: ${JSON.stringify(fkViolations[0])}`);
            }
        });
        transaction();
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        failure = new MigrationError(`Failed to migrate from v4 to v5 (image filtering columns): ${cause}`, 'migrate', 'images', error);
    }
    // Always restore FK enforcement, on success and on every failure path.
    try {
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (pragmaErr) {
        const cause = pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr);
        if (failure === null) {
            failure = new MigrationError(`Failed to migrate from v4 to v5 (image filtering columns): ${cause}`, 'migrate', 'images', pragmaErr);
        }
        else {
            // Keep the original migration error; just report the secondary failure.
            console.error('[migrations] Failed to re-enable foreign keys:', cause);
        }
    }
    if (failure !== null) {
        throw failure;
    }
}
/**
 * Migrate from schema version 5 to version 6
 *
 * Changes in v6:
 * - vlm_fts: FTS5 virtual table for VLM description full-text search
 * - vlm_fts_ai/ad/au: Sync triggers on embeddings (where image_id IS NOT NULL)
 * - fts_index_metadata: Remove CHECK (id = 1) constraint to allow id=2 row for VLM FTS
 * - fts_index_metadata id=2: VLM FTS metadata row
 *
 * The work is split into two phases so that a re-run after an interrupted
 * attempt can pick up where it left off:
 * 1. DDL phase (not wrapped in an explicit transaction here): rename the old
 *    metadata table to fts_index_metadata_old, create the new metadata table,
 *    create vlm_fts and its triggers. Skipped entirely when vlm_fts already
 *    exists.
 * 2. DML phase (inside BEGIN/COMMIT): copy metadata rows from the backup and
 *    drop it, insert the id=2 row, and populate vlm_fts from existing
 *    embeddings that have an image_id.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV5ToV6(db) {
    try {
        // Check if DDL phase already completed (safe on retry after partial failure)
        const vlmFtsExists = db
            .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='vlm_fts'")
            .get();
        const newMetadataExists = db
            .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='fts_index_metadata'")
            .get();
        const oldBackupExists = db
            .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='fts_index_metadata_old'")
            .get();
        if (!vlmFtsExists) {
            // DDL phase not yet completed -- run it
            // Only rename if the backup doesn't already exist from a previous interrupted run
            if (!oldBackupExists && newMetadataExists) {
                db.exec('ALTER TABLE fts_index_metadata RENAME TO fts_index_metadata_old');
            }
            // Create new metadata table (without CHECK (id = 1) constraint)
            // NOTE(review): schema_version defaults to 7 although this migration
            // targets v6 — presumably mirrors the canonical definition; confirm.
            db.exec(`
CREATE TABLE IF NOT EXISTS fts_index_metadata (
id INTEGER PRIMARY KEY,
last_rebuild_at TEXT,
chunks_indexed INTEGER NOT NULL DEFAULT 0,
tokenizer TEXT NOT NULL DEFAULT 'porter unicode61',
schema_version INTEGER NOT NULL DEFAULT 7,
content_hash TEXT
)
`);
            // Create VLM FTS5 virtual table
            db.exec(CREATE_VLM_FTS_TABLE);
            // Create VLM FTS sync triggers
            for (const trigger of CREATE_VLM_FTS_TRIGGERS) {
                db.exec(trigger);
            }
        }
        // DML phase: always safe to retry (uses INSERT OR IGNORE, checks before DROP)
        db.exec('BEGIN TRANSACTION');
        try {
            // Copy data from old table if it still exists and new table needs it.
            // Re-queried here (rather than reusing oldBackupExists) because the
            // DDL phase above may have just created the backup via RENAME.
            const oldStillExists = db
                .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='fts_index_metadata_old'")
                .get();
            if (oldStillExists) {
                // Only copy if new table doesn't already have the data (id=1 row)
                const hasChunkMetadata = db.prepare('SELECT id FROM fts_index_metadata WHERE id = 1').get();
                if (!hasChunkMetadata) {
                    // NOTE(review): SELECT * assumes the old and new metadata
                    // tables share the same column order — confirm.
                    db.exec('INSERT OR IGNORE INTO fts_index_metadata SELECT * FROM fts_index_metadata_old');
                }
                // Safe to drop backup now that data is in the new table
                db.exec('DROP TABLE fts_index_metadata_old');
            }
            // Insert VLM FTS metadata row (id=2); OR IGNORE keeps an existing row on retry
            const now = new Date().toISOString();
            db.prepare(`
INSERT OR IGNORE INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
VALUES (2, ?, 0, 'porter unicode61', 6, NULL)
`).run(now);
            // Populate vlm_fts from existing VLM embeddings
            const vlmCount = db
                .prepare('SELECT COUNT(*) as cnt FROM embeddings WHERE image_id IS NOT NULL')
                .get();
            if (vlmCount.cnt > 0) {
                // Only populate if not already done (check FTS row count)
                // NOTE(review): CREATE_VLM_FTS_TABLE is defined elsewhere. If it
                // declares an external-content table over embeddings, this
                // COUNT(*) may be answered from the content table rather than
                // the index, in which case the guard would skip population even
                // though nothing is indexed yet — confirm.
                const ftsCount = db.prepare('SELECT COUNT(*) as cnt FROM vlm_fts').get();
                if (ftsCount.cnt === 0) {
                    db.exec(`
INSERT INTO vlm_fts(rowid, original_text)
SELECT rowid, original_text FROM embeddings WHERE image_id IS NOT NULL
`);
                }
                // Update VLM FTS metadata with count
                db.prepare('UPDATE fts_index_metadata SET chunks_indexed = ?, last_rebuild_at = ? WHERE id = 2').run(vlmCount.cnt, now);
            }
            db.exec('COMMIT');
        }
        catch (dmlError) {
            try {
                db.exec('ROLLBACK');
            }
            catch (rollbackErr) {
                console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
            // Re-throw so the outer catch wraps it in a MigrationError.
            throw dmlError;
        }
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v5 to v6 (VLM FTS setup): ${cause}`, 'migrate', 'vlm_fts', error);
    }
}
690
/**
 * Migrate from schema version 6 to version 7
 *
 * Changes in v7:
 * - provenance.source_type: Added 'VLM_DEDUP' to CHECK constraint
 *   This allows VLM pipeline to record deduplicated image results with
 *   a distinct source_type for provenance tracking.
 *
 * The provenance table is rebuilt (create provenance_new, copy, drop, rename,
 * re-index) inside a single transaction. Foreign key enforcement is switched
 * off for the swap and integrity is verified explicitly before COMMIT.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV6ToV7(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Create new provenance table with VLM_DEDUP in source_type CHECK
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 2: Copy existing data (column order is identical in both tables)
        db.exec(`
            INSERT INTO provenance_new
            SELECT * FROM provenance
        `);
        // Step 3: Drop old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate indexes (dropped together with the old table)
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v6->v7 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // (e.g. no transaction is active) must not prevent FK enforcement
        // from being switched back on for this connection.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate provenance table from v6 to v7: ${cause}`, 'migrate', 'provenance', error);
    }
}
770
/**
 * Migrate from schema version 9 to version 10
 *
 * Changes in v10:
 * - embeddings.extraction_id: New column for extraction-sourced embeddings
 * - embeddings CHECK: Now allows extraction_id-only rows
 * - embeddings FK: extraction_id REFERENCES extractions(id)
 * - idx_embeddings_extraction_id: New index
 *
 * The embeddings table is rebuilt inside a single transaction. Rows are
 * copied together with their rowid because vlm_fts entries are keyed by
 * embeddings.rowid (see the triggers recreated below); letting SQLite assign
 * fresh rowids could detach existing FTS entries from their rows.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV9ToV10(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Create new embeddings table with extraction_id + updated CHECK
        db.exec(`
            CREATE TABLE embeddings_new (
                id TEXT PRIMARY KEY,
                chunk_id TEXT,
                image_id TEXT,
                extraction_id TEXT,
                document_id TEXT NOT NULL,
                original_text TEXT NOT NULL,
                original_text_length INTEGER NOT NULL,
                source_file_path TEXT NOT NULL,
                source_file_name TEXT NOT NULL,
                source_file_hash TEXT NOT NULL,
                page_number INTEGER,
                page_range TEXT,
                character_start INTEGER NOT NULL,
                character_end INTEGER NOT NULL,
                chunk_index INTEGER NOT NULL,
                total_chunks INTEGER NOT NULL,
                model_name TEXT NOT NULL,
                model_version TEXT NOT NULL,
                task_type TEXT NOT NULL CHECK (task_type IN ('search_document', 'search_query')),
                inference_mode TEXT NOT NULL CHECK (inference_mode = 'local'),
                gpu_device TEXT,
                provenance_id TEXT NOT NULL UNIQUE,
                content_hash TEXT NOT NULL,
                created_at TEXT NOT NULL,
                generation_duration_ms INTEGER,
                FOREIGN KEY (chunk_id) REFERENCES chunks(id),
                FOREIGN KEY (image_id) REFERENCES images(id),
                FOREIGN KEY (extraction_id) REFERENCES extractions(id),
                FOREIGN KEY (document_id) REFERENCES documents(id),
                FOREIGN KEY (provenance_id) REFERENCES provenance(id),
                CHECK (chunk_id IS NOT NULL OR image_id IS NOT NULL OR extraction_id IS NOT NULL)
            )
        `);
        // Step 2: Copy existing data (extraction_id = NULL for all existing embeddings).
        // rowid is carried over explicitly so vlm_fts rows keep pointing at the
        // same embeddings after the rebuild.
        db.exec(`
            INSERT INTO embeddings_new (
                rowid, id, chunk_id, image_id, extraction_id, document_id, original_text, original_text_length,
                source_file_path, source_file_name, source_file_hash, page_number, page_range,
                character_start, character_end, chunk_index, total_chunks, model_name,
                model_version, task_type, inference_mode, gpu_device, provenance_id,
                content_hash, created_at, generation_duration_ms
            )
            SELECT
                rowid, id, chunk_id, image_id, NULL, document_id, original_text, original_text_length,
                source_file_path, source_file_name, source_file_hash, page_number, page_range,
                character_start, character_end, chunk_index, total_chunks, model_name,
                model_version, task_type, inference_mode, gpu_device, provenance_id,
                content_hash, created_at, generation_duration_ms
            FROM embeddings
        `);
        // Step 3: Drop old table
        db.exec('DROP TABLE embeddings');
        // Step 4: Rename new table
        db.exec('ALTER TABLE embeddings_new RENAME TO embeddings');
        // Step 5: Recreate all embeddings indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_chunk_id ON embeddings(chunk_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_image_id ON embeddings(image_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_extraction_id ON embeddings(extraction_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_document_id ON embeddings(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_source_file ON embeddings(source_file_path)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_embeddings_page ON embeddings(page_number)');
        // Step 6: Recreate VLM FTS triggers (they reference embeddings table which was recreated)
        // The triggers were lost when the old embeddings table was dropped.
        // Check if vlm_fts exists - if so, recreate its triggers
        const vlmFtsExists = db
            .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='vlm_fts'")
            .get();
        if (vlmFtsExists) {
            // Drop old triggers if they exist
            db.exec('DROP TRIGGER IF EXISTS vlm_fts_ai');
            db.exec('DROP TRIGGER IF EXISTS vlm_fts_ad');
            db.exec('DROP TRIGGER IF EXISTS vlm_fts_au');
            // Recreate
            db.exec(`CREATE TRIGGER IF NOT EXISTS vlm_fts_ai AFTER INSERT ON embeddings
                WHEN new.image_id IS NOT NULL BEGIN
                    INSERT INTO vlm_fts(rowid, original_text) VALUES (new.rowid, new.original_text);
                END`);
            db.exec(`CREATE TRIGGER IF NOT EXISTS vlm_fts_ad AFTER DELETE ON embeddings
                WHEN old.image_id IS NOT NULL BEGIN
                    INSERT INTO vlm_fts(vlm_fts, rowid, original_text) VALUES('delete', old.rowid, old.original_text);
                END`);
            db.exec(`CREATE TRIGGER IF NOT EXISTS vlm_fts_au AFTER UPDATE OF original_text ON embeddings
                WHEN new.image_id IS NOT NULL BEGIN
                    INSERT INTO vlm_fts(vlm_fts, rowid, original_text) VALUES('delete', old.rowid, old.original_text);
                    INSERT INTO vlm_fts(rowid, original_text) VALUES (new.rowid, new.original_text);
                END`);
        }
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v9->v10 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // must not prevent FK enforcement from being switched back on.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v9 to v10 (extraction embeddings): ${cause}`, 'migrate', 'embeddings', error);
    }
}
897
/**
 * Migrate from schema version 10 to version 11
 *
 * Changes in v11:
 * - ocr_results.json_blocks: JSON block hierarchy from Datalab
 * - ocr_results.extras_json: Extra metadata (cost_breakdown, Datalab metadata)
 *
 * Uses ALTER TABLE ADD COLUMN (nullable TEXT columns, no table recreation needed).
 * Columns that already exist are skipped, so re-running the step is harmless.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV10ToV11(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        const columns = db.prepare('PRAGMA table_info(ocr_results)').all();
        const names = new Set(columns.map((c) => c.name));
        const transaction = db.transaction(() => {
            for (const column of ['json_blocks', 'extras_json']) {
                if (!names.has(column)) {
                    db.exec(`ALTER TABLE ocr_results ADD COLUMN ${column} TEXT`);
                }
            }
            // M-5: FK integrity check inside transaction so violations cause rollback
            const fkViolations = db.pragma('foreign_key_check');
            if (fkViolations.length > 0) {
                throw new Error(`Foreign key integrity check failed after v10->v11 migration: ${fkViolations.length} violation(s). ` +
                    `First: ${JSON.stringify(fkViolations[0])}`);
            }
        });
        transaction();
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Restoring FK enforcement is best-effort: if the pragma itself throws,
        // the original failure must still surface as a MigrationError.
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v10 to v11 (json_blocks, extras_json): ${cause}`, 'migrate', 'ocr_results', error);
    }
}
937
/**
 * Migrate database to the latest schema version
 *
 * Reads the current version and then:
 * - version 0: treats the database as fresh and initializes the full schema
 * - version equal to SCHEMA_VERSION: does nothing
 * - version above SCHEMA_VERSION: refuses to run (application is too old)
 * - otherwise: applies every pending migration in ascending order, recording
 *   the new version after each successful step
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
export function migrateToLatest(db) {
    const currentVersion = checkSchemaVersion(db);
    if (currentVersion === 0) {
        // Fresh database - initialize everything
        initializeDatabase(db);
        return;
    }
    if (currentVersion === SCHEMA_VERSION) {
        // Already at latest version
        return;
    }
    if (currentVersion > SCHEMA_VERSION) {
        throw new MigrationError(`Database schema version (${String(currentVersion)}) is newer than supported version (${String(SCHEMA_VERSION)}). ` +
            'Please update the application.', 'version_check', undefined);
    }
    // Helper to bump schema_version immediately after each successful migration step.
    // This ensures crash-safety: if the process dies between migrations, only the
    // remaining migrations re-run on restart (MIG-1 fix).
    const bumpVersion = (targetVersion) => {
        let result;
        try {
            result = db.prepare('UPDATE schema_version SET version = ?, updated_at = ? WHERE id = 1').run(targetVersion, new Date().toISOString());
        }
        catch (error) {
            throw new MigrationError(`Failed to update schema version to ${String(targetVersion)} after migration`, 'update', 'schema_version', error);
        }
        // An UPDATE that matches no row would silently leave the stored version
        // unchanged, so the same migrations would be attempted again next start.
        if (result != null && result.changes === 0) {
            throw new MigrationError(`Failed to update schema version to ${String(targetVersion)} after migration: schema_version row (id = 1) not found`, 'update', 'schema_version', undefined);
        }
    };
    // Ordered migration steps: [version reached, migration function, bumps its own version?].
    // M-6: migrateV30ToV31 receives bumpVersion so the version bump runs inside the
    // same transaction as the migration body, making them atomic.
    const steps = [
        [2, migrateV1ToV2],
        [3, migrateV2ToV3],
        [4, migrateV3ToV4],
        [5, migrateV4ToV5],
        [6, migrateV5ToV6],
        [7, migrateV6ToV7],
        [8, migrateV7ToV8],
        [9, migrateV8ToV9],
        [10, migrateV9ToV10],
        [11, migrateV10ToV11],
        [12, migrateV11ToV12],
        [13, migrateV12ToV13],
        [14, migrateV13ToV14],
        [15, migrateV14ToV15],
        [16, migrateV15ToV16],
        [17, migrateV16ToV17],
        [18, migrateV17ToV18],
        [19, migrateV18ToV19],
        [20, migrateV19ToV20],
        [21, migrateV20ToV21],
        [22, migrateV21ToV22],
        [23, migrateV22ToV23],
        [24, migrateV23ToV24],
        [25, migrateV24ToV25],
        [26, migrateV25ToV26],
        [27, migrateV26ToV27],
        [28, migrateV27ToV28],
        [29, migrateV28ToV29],
        [30, migrateV29ToV30],
        [31, migrateV30ToV31, true],
        [32, migrateV31ToV32],
    ];
    // Apply migrations incrementally, bumping version after each step
    for (const [targetVersion, migrate, bumpsItself = false] of steps) {
        if (currentVersion >= targetVersion) {
            continue;
        }
        if (bumpsItself) {
            migrate(db, bumpVersion);
        }
        else {
            migrate(db);
            bumpVersion(targetVersion);
        }
    }
}
1098
/**
 * Migrate from schema version 7 to version 8
 *
 * Changes in v8:
 * - extractions: New table for structured data extracted via page_schema
 * - form_fills: New table for Datalab /fill API results
 * - documents: Added doc_title, doc_author, doc_subject columns
 * - provenance.type: Added 'EXTRACTION', 'FORM_FILL' to CHECK constraint
 * - provenance.source_type: Added 'EXTRACTION', 'FORM_FILL' to CHECK constraint
 * - New indexes: idx_extractions_document_id, idx_form_fills_status, idx_documents_doc_title
 *
 * Everything runs in one transaction with foreign key enforcement switched
 * off; integrity is verified explicitly before COMMIT.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV7ToV8(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Create new tables
        db.exec(CREATE_EXTRACTIONS_TABLE);
        db.exec(CREATE_FORM_FILLS_TABLE);
        // Step 2: Add new columns to documents table (skipping any already present)
        const columns = db.prepare('PRAGMA table_info(documents)').all();
        const columnNames = new Set(columns.map((c) => c.name));
        for (const column of ['doc_title', 'doc_author', 'doc_subject']) {
            if (!columnNames.has(column)) {
                db.exec(`ALTER TABLE documents ADD COLUMN ${column} TEXT`);
            }
        }
        // Step 3: Create new indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_extractions_document_id ON extractions(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_form_fills_status ON form_fills(status)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_documents_doc_title ON documents(doc_title)');
        // Step 4: Recreate provenance table with EXTRACTION and FORM_FILL in CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 5: Copy existing provenance data
        db.exec(`
            INSERT INTO provenance_new
            SELECT * FROM provenance
        `);
        // Step 6: Drop old provenance table
        db.exec('DROP TABLE provenance');
        // Step 7: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 8: Recreate provenance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v7->v8 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // must not prevent FK enforcement from being switched back on.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v7 to v8 (extractions, form_fills, doc metadata): ${cause}`, 'migrate', 'provenance', error);
    }
}
1200
/**
 * Migrate from schema version 8 to version 9
 *
 * Changes in v9:
 * - extractions_fts: FTS5 virtual table for extraction content full-text search
 * - extractions_fts_ai/ad/au: Sync triggers on extractions table
 * - fts_index_metadata id=3: Extraction FTS metadata row
 * - form_fills.cost_cents: Changed from INTEGER to REAL (fractional cents)
 *
 * Everything runs in one transaction with foreign key enforcement switched
 * off; integrity is verified explicitly before COMMIT.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV8ToV9(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Create extractions FTS5 virtual table
        db.exec(CREATE_EXTRACTIONS_FTS_TABLE);
        // Step 2: Create extractions FTS sync triggers
        for (const trigger of CREATE_EXTRACTIONS_FTS_TRIGGERS) {
            db.exec(trigger);
        }
        // Step 3: Populate FTS from existing extractions
        db.exec("INSERT INTO extractions_fts(extractions_fts) VALUES('rebuild')");
        // Step 4: Add extraction FTS metadata row (id=3); INSERT OR IGNORE keeps
        // an already existing row untouched
        const now = new Date().toISOString();
        const extractionCount = db.prepare('SELECT COUNT(*) as cnt FROM extractions').get().cnt;
        db.prepare(`
            INSERT OR IGNORE INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
            VALUES (3, ?, ?, 'porter unicode61', 9, NULL)
        `).run(now, extractionCount);
        // Step 5: Recreate form_fills with cost_cents REAL (was INTEGER)
        db.exec(`
            CREATE TABLE form_fills_new (
                id TEXT PRIMARY KEY NOT NULL,
                source_file_path TEXT NOT NULL,
                source_file_hash TEXT NOT NULL,
                field_data_json TEXT NOT NULL,
                context TEXT,
                confidence_threshold REAL NOT NULL DEFAULT 0.5,
                output_file_path TEXT,
                output_base64 TEXT,
                fields_filled TEXT NOT NULL DEFAULT '[]',
                fields_not_found TEXT NOT NULL DEFAULT '[]',
                page_count INTEGER,
                cost_cents REAL,
                status TEXT NOT NULL CHECK(status IN ('pending', 'processing', 'complete', 'failed')),
                error_message TEXT,
                provenance_id TEXT NOT NULL REFERENCES provenance(id),
                created_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        `);
        db.exec('INSERT INTO form_fills_new SELECT * FROM form_fills');
        db.exec('DROP TABLE form_fills');
        db.exec('ALTER TABLE form_fills_new RENAME TO form_fills');
        // The status index was dropped with the old table
        db.exec('CREATE INDEX IF NOT EXISTS idx_form_fills_status ON form_fills(status)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v8->v9 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // must not prevent FK enforcement from being switched back on.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v8 to v9 (extractions FTS, cost_cents REAL): ${cause}`, 'migrate', 'extractions_fts', error);
    }
}
1277
/**
 * Migrate from schema version 11 to version 12
 *
 * Changes in v12:
 * - uploaded_files: New table for Datalab cloud file uploads
 * - documents.datalab_file_id: New column linking documents to uploaded files
 * - 3 new indexes: idx_uploaded_files_file_hash, idx_uploaded_files_status, idx_uploaded_files_datalab_file_id
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV11ToV12(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        const transaction = db.transaction(() => {
            // Create uploaded_files table
            db.exec(CREATE_UPLOADED_FILES_TABLE);
            // Create indexes
            db.exec('CREATE INDEX IF NOT EXISTS idx_uploaded_files_file_hash ON uploaded_files(file_hash)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_uploaded_files_status ON uploaded_files(upload_status)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_uploaded_files_datalab_file_id ON uploaded_files(datalab_file_id)');
            // Add datalab_file_id column to documents (skipped when already present)
            const columns = db.prepare('PRAGMA table_info(documents)').all();
            if (!columns.some((c) => c.name === 'datalab_file_id')) {
                db.exec('ALTER TABLE documents ADD COLUMN datalab_file_id TEXT');
            }
            // M-5: FK integrity check inside transaction so violations cause rollback
            const fkViolations = db.pragma('foreign_key_check');
            if (fkViolations.length > 0) {
                throw new Error(`Foreign key integrity check failed after v11->v12 migration: ${fkViolations.length} violation(s). ` +
                    `First: ${JSON.stringify(fkViolations[0])}`);
            }
        });
        transaction();
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Restoring FK enforcement is best-effort: if the pragma itself throws,
        // the original failure must still surface as a MigrationError.
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v11 to v12 (uploaded_files): ${cause}`, 'migrate', 'uploaded_files', error);
    }
}
1319
/**
 * Migrate from schema version 12 to version 13
 *
 * Changes in v13:
 * - provenance.type: Added 'ENTITY_EXTRACTION' to CHECK constraint
 * - provenance.source_type: Added 'ENTITY_EXTRACTION' to CHECK constraint
 * - entities: New table for named entities extracted from documents
 * - entity_mentions: New table for entity occurrence tracking
 * - 4 new indexes: idx_entities_document_id, idx_entities_entity_type,
 *   idx_entities_normalized_text, idx_entity_mentions_entity_id
 *
 * Everything runs in one transaction with foreign key enforcement switched
 * off; integrity is verified explicitly before COMMIT.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV12ToV13(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Recreate provenance table with ENTITY_EXTRACTION in CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 2: Copy existing data
        db.exec('INSERT INTO provenance_new SELECT * FROM provenance');
        // Step 3: Drop old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate provenance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // Step 6: Create entities and entity_mentions tables
        db.exec(CREATE_ENTITIES_TABLE);
        db.exec(CREATE_ENTITY_MENTIONS_TABLE);
        // Step 7: Create indexes for new tables
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_document_id ON entities(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_entity_type ON entities(entity_type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_normalized_text ON entities(normalized_text)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_mentions_entity_id ON entity_mentions(entity_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v12->v13 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // must not prevent FK enforcement from being switched back on.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v12 to v13 (entity extraction): ${cause}`, 'migrate', 'provenance', error);
    }
}
1407
/**
 * Migrate from schema version 13 to version 14
 *
 * Changes in v14:
 * - provenance.type: Added 'COMPARISON' to CHECK constraint
 * - provenance.source_type: Added 'COMPARISON' to CHECK constraint
 * - comparisons: New table for document comparison results
 * - 3 new indexes: idx_comparisons_doc1, idx_comparisons_doc2, idx_comparisons_created
 *
 * Everything runs in one transaction with foreign key enforcement switched
 * off; integrity is verified explicitly before COMMIT.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV13ToV14(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Recreate provenance table with COMPARISON in CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 2: Copy existing data
        db.exec('INSERT INTO provenance_new SELECT * FROM provenance');
        // Step 3: Drop old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate provenance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // Step 6: Create comparisons table
        db.exec(CREATE_COMPARISONS_TABLE);
        // Step 7: Create indexes for comparisons table
        db.exec('CREATE INDEX IF NOT EXISTS idx_comparisons_doc1 ON comparisons(document_id_1)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_comparisons_doc2 ON comparisons(document_id_2)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_comparisons_created ON comparisons(created_at)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v13->v14 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Cleanup is two independent best-effort steps: a failing ROLLBACK
        // must not prevent FK enforcement from being switched back on.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v13 to v14 (document comparison): ${cause}`, 'migrate', 'provenance', error);
    }
}
1491
/**
 * Migrate from schema version 15 to version 16 (knowledge graph).
 *
 * Changes in v16:
 * - provenance.type: Added 'KNOWLEDGE_GRAPH' to CHECK constraint
 * - provenance.source_type: Added 'KNOWLEDGE_GRAPH' to CHECK constraint
 *   (the table is recreated, copied, dropped and renamed to change the constraints)
 * - knowledge_nodes: New table for unified entities resolved across documents
 * - knowledge_edges: New table for relationships between knowledge nodes
 * - node_entity_links: New table linking knowledge nodes to source entity extractions
 * - 8 new indexes: idx_kn_entity_type, idx_kn_normalized_name, idx_kn_document_count,
 *   idx_ke_source_node, idx_ke_target_node, idx_ke_relationship_type,
 *   idx_nel_node_id, idx_nel_document_id
 *
 * Runs in a single transaction with foreign key enforcement switched off; a
 * foreign_key_check is performed before COMMIT so violations roll everything back.
 * On failure the rollback and the re-enabling of foreign keys are attempted
 * independently, so one failing does not skip the other.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV15ToV16(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Recreate provenance table with KNOWLEDGE_GRAPH in CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING', 'KNOWLEDGE_GRAPH')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING', 'KNOWLEDGE_GRAPH')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 2: Copy existing data
        db.exec('INSERT INTO provenance_new SELECT * FROM provenance');
        // Step 3: Drop old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate provenance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // Step 6: Create knowledge graph tables
        db.exec(CREATE_KNOWLEDGE_NODES_TABLE);
        db.exec(CREATE_KNOWLEDGE_EDGES_TABLE);
        db.exec(CREATE_NODE_ENTITY_LINKS_TABLE);
        // Step 7: Create indexes for knowledge graph tables
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_entity_type ON knowledge_nodes(entity_type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_normalized_name ON knowledge_nodes(normalized_name)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_document_count ON knowledge_nodes(document_count DESC)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_source_node ON knowledge_edges(source_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_target_node ON knowledge_edges(target_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_relationship_type ON knowledge_edges(relationship_type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_nel_node_id ON node_entity_links(node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_nel_document_id ON node_entity_links(document_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v15->v16 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // ROLLBACK can itself throw (e.g. when BEGIN never succeeded); keep it separate
        // so that foreign keys are still re-enabled afterwards.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v15 to v16 (knowledge graph): ${cause}`, 'migrate', 'provenance', error);
    }
}
1586
/**
 * Migrate from schema version 14 to version 15 (document clustering).
 *
 * Changes in v15:
 * - provenance.type: Added 'CLUSTERING' to CHECK constraint
 * - provenance.source_type: Added 'CLUSTERING' to CHECK constraint
 *   (the table is recreated, copied, dropped and renamed to change the constraints)
 * - clusters: New table for document clustering results
 * - document_clusters: New table for document-cluster assignments
 * - 6 new indexes: idx_clusters_run_id, idx_clusters_tag, idx_clusters_created,
 *   idx_doc_clusters_document, idx_doc_clusters_cluster, idx_doc_clusters_run
 *
 * Runs in a single transaction with foreign key enforcement switched off; a
 * foreign_key_check is performed before COMMIT so violations roll everything back.
 * On failure the rollback and the re-enabling of foreign keys are attempted
 * independently, so one failing does not skip the other.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV14ToV15(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Recreate provenance table with CLUSTERING in CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Step 2: Copy existing data
        db.exec('INSERT INTO provenance_new SELECT * FROM provenance');
        // Step 3: Drop old table
        db.exec('DROP TABLE provenance');
        // Step 4: Rename new table
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        // Step 5: Recreate provenance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // Step 6: Create clusters table
        db.exec(CREATE_CLUSTERS_TABLE);
        // Step 7: Create document_clusters table
        db.exec(CREATE_DOCUMENT_CLUSTERS_TABLE);
        // Step 8: Create indexes for clustering tables
        db.exec('CREATE INDEX IF NOT EXISTS idx_clusters_run_id ON clusters(run_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_clusters_tag ON clusters(classification_tag)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_clusters_created ON clusters(created_at DESC)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_doc_clusters_document ON document_clusters(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_doc_clusters_cluster ON document_clusters(cluster_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_doc_clusters_run ON document_clusters(run_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v14->v15 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // ROLLBACK can itself throw (e.g. when BEGIN never succeeded); keep it separate
        // so that foreign keys are still re-enabled afterwards.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v14 to v15 (document clustering): ${cause}`, 'migrate', 'provenance', error);
    }
}
1677
/**
 * Migrate from schema version 16 to version 17 (knowledge graph optimization).
 *
 * Changes in v17:
 * - knowledge_nodes.edge_count: New column tracking edge count per node (backfilled here)
 * - node_entity_links.resolution_method: New column tracking how entity was resolved
 * - knowledge_edges: Recreated with an expanded CHECK constraint that includes the
 *   'precedes' and 'occurred_at' relationship types
 * - knowledge_nodes_fts: New FTS5 virtual table for knowledge node full-text search
 * - knowledge_nodes_fts_ai/ad/au: FTS5 sync triggers for knowledge_nodes
 * - idx_knowledge_nodes_canonical_lower: Case-insensitive index on canonical_name
 * - idx_entity_mentions_chunk_id: Index on entity_mentions.chunk_id for chunk-based lookups
 *
 * Runs in a single transaction with foreign key enforcement switched off; a
 * foreign_key_check is performed before COMMIT so violations roll everything back.
 * On failure the rollback and the re-enabling of foreign keys are attempted
 * independently, so one failing does not skip the other.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV16ToV17(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Add resolution_method column to node_entity_links (if not already present from fresh schema)
        const nelColumns = db.pragma('table_info(node_entity_links)');
        if (!nelColumns.some((c) => c.name === 'resolution_method')) {
            db.exec('ALTER TABLE node_entity_links ADD COLUMN resolution_method TEXT');
        }
        // Step 2: Add edge_count column to knowledge_nodes (if not already present from fresh schema)
        const knColumns = db.pragma('table_info(knowledge_nodes)');
        if (!knColumns.some((c) => c.name === 'edge_count')) {
            db.exec('ALTER TABLE knowledge_nodes ADD COLUMN edge_count INTEGER NOT NULL DEFAULT 0');
        }
        // Step 3: Recreate knowledge_edges with expanded CHECK constraint
        // Include v20 columns (valid_from, valid_until, normalized_weight, contradiction_count)
        // so that SELECT * works regardless of whether the source table was created fresh (with v20 cols)
        // or via earlier migrations (without them).
        const keColumns = db.pragma('table_info(knowledge_edges)');
        const hasV20Cols = keColumns.some((c) => c.name === 'valid_from');
        db.exec(`
            CREATE TABLE knowledge_edges_new (
                id TEXT PRIMARY KEY,
                source_node_id TEXT NOT NULL,
                target_node_id TEXT NOT NULL,
                relationship_type TEXT NOT NULL CHECK (relationship_type IN (
                    'co_mentioned', 'co_located', 'works_at', 'represents',
                    'located_in', 'filed_in', 'cites', 'references',
                    'party_to', 'related_to', 'precedes', 'occurred_at'
                )),
                weight REAL NOT NULL DEFAULT 1.0,
                evidence_count INTEGER NOT NULL DEFAULT 1,
                document_ids TEXT NOT NULL,
                metadata TEXT,
                provenance_id TEXT NOT NULL,
                created_at TEXT NOT NULL,
                valid_from TEXT,
                valid_until TEXT,
                normalized_weight REAL DEFAULT 0,
                contradiction_count INTEGER DEFAULT 0,
                FOREIGN KEY (source_node_id) REFERENCES knowledge_nodes(id),
                FOREIGN KEY (target_node_id) REFERENCES knowledge_nodes(id),
                FOREIGN KEY (provenance_id) REFERENCES provenance(id)
            )
        `);
        // Step 4: Copy existing edges (use explicit columns if source lacks v20 columns)
        if (hasV20Cols) {
            db.exec('INSERT INTO knowledge_edges_new SELECT * FROM knowledge_edges');
        }
        else {
            db.exec(`INSERT INTO knowledge_edges_new (id, source_node_id, target_node_id, relationship_type, weight, evidence_count, document_ids, metadata, provenance_id, created_at)
                SELECT id, source_node_id, target_node_id, relationship_type, weight, evidence_count, document_ids, metadata, provenance_id, created_at FROM knowledge_edges`);
        }
        // Step 5: Drop old edges table
        db.exec('DROP TABLE knowledge_edges');
        // Step 6: Rename new table
        db.exec('ALTER TABLE knowledge_edges_new RENAME TO knowledge_edges');
        // Step 7: Recreate indexes on knowledge_edges (dropped with old table)
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_source_node ON knowledge_edges(source_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_target_node ON knowledge_edges(target_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_relationship_type ON knowledge_edges(relationship_type)');
        // Step 8: Create new optimization indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_knowledge_nodes_canonical_lower ON knowledge_nodes(canonical_name COLLATE NOCASE)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_mentions_chunk_id ON entity_mentions(chunk_id)');
        // Step 9: Backfill edge_count from existing edges
        db.exec(`
            UPDATE knowledge_nodes SET edge_count = (
                SELECT COUNT(*) FROM knowledge_edges
                WHERE source_node_id = knowledge_nodes.id OR target_node_id = knowledge_nodes.id
            )
        `);
        // Step 10: Create knowledge_nodes_fts FTS5 table
        db.exec(CREATE_KNOWLEDGE_NODES_FTS_TABLE);
        // Step 11: Create FTS triggers
        for (const trigger of CREATE_KNOWLEDGE_NODES_FTS_TRIGGERS) {
            db.exec(trigger);
        }
        // Step 12: Populate FTS from existing knowledge_nodes
        const nodeCount = db.prepare('SELECT COUNT(*) as cnt FROM knowledge_nodes').get();
        if (nodeCount.cnt > 0) {
            db.exec(`
                INSERT INTO knowledge_nodes_fts(rowid, canonical_name)
                SELECT rowid, canonical_name FROM knowledge_nodes
            `);
        }
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v16->v17 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // ROLLBACK can itself throw (e.g. when BEGIN never succeeded); keep it separate
        // so that foreign keys are still re-enabled afterwards.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v16 to v17 (knowledge graph optimization): ${cause}`, 'migrate', 'knowledge_edges', error);
    }
}
1798
/**
 * Migrate from schema version 17 to version 18 (medical entity types).
 *
 * Changes in v18:
 * - entities.entity_type: Added 'medication', 'diagnosis' to CHECK constraint
 * - knowledge_nodes.entity_type: Added 'medication', 'diagnosis' to CHECK constraint
 *   (the CHECK list written by this migration also contains 'medical_device')
 * - knowledge_nodes_fts and its insert/delete/update triggers are dropped and recreated
 *   around the knowledge_nodes rebuild, then repopulated from the new table
 *
 * SQLite CHECK constraints require table recreation to modify.
 *
 * Runs in a single transaction with foreign key enforcement switched off; a
 * foreign_key_check is performed before COMMIT so violations roll everything back.
 * On failure the rollback and the re-enabling of foreign keys are attempted
 * independently, so one failing does not skip the other.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV17ToV18(db) {
    // Shared by both recreated tables so their entity_type constraints stay identical.
    const entityTypeCheck = `('person', 'organization', 'date', 'amount', 'case_number', 'location', 'statute', 'exhibit', 'medication', 'diagnosis', 'medical_device', 'other')`;
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Recreate entities table with expanded CHECK constraint
        db.exec(`
            CREATE TABLE entities_new (
                id TEXT PRIMARY KEY NOT NULL,
                document_id TEXT NOT NULL REFERENCES documents(id),
                entity_type TEXT NOT NULL CHECK (entity_type IN ${entityTypeCheck}),
                raw_text TEXT NOT NULL,
                normalized_text TEXT NOT NULL,
                confidence REAL NOT NULL DEFAULT 0.0,
                metadata TEXT,
                provenance_id TEXT NOT NULL REFERENCES provenance(id),
                created_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        `);
        db.exec('INSERT INTO entities_new SELECT * FROM entities');
        db.exec('DROP TABLE entities');
        db.exec('ALTER TABLE entities_new RENAME TO entities');
        // Recreate entities indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_document_id ON entities(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_entity_type ON entities(entity_type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entities_normalized_text ON entities(normalized_text)');
        // Step 2: Recreate knowledge_nodes table with expanded CHECK constraint
        // Include v20 columns (importance_score, resolution_type) so that SELECT * works
        // regardless of whether the source table was created fresh (with v20 cols) or via earlier migrations.
        const knColsV18 = db.pragma('table_info(knowledge_nodes)');
        const hasV20NodeCols = knColsV18.some((c) => c.name === 'importance_score');
        db.exec(`
            CREATE TABLE knowledge_nodes_new (
                id TEXT PRIMARY KEY,
                entity_type TEXT NOT NULL CHECK (entity_type IN ${entityTypeCheck}),
                canonical_name TEXT NOT NULL,
                normalized_name TEXT NOT NULL,
                aliases TEXT,
                document_count INTEGER NOT NULL DEFAULT 1,
                mention_count INTEGER NOT NULL DEFAULT 0,
                edge_count INTEGER NOT NULL DEFAULT 0,
                avg_confidence REAL NOT NULL DEFAULT 0.0,
                metadata TEXT,
                provenance_id TEXT NOT NULL,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                importance_score REAL,
                resolution_type TEXT,
                FOREIGN KEY (provenance_id) REFERENCES provenance(id)
            )
        `);
        if (hasV20NodeCols) {
            db.exec('INSERT INTO knowledge_nodes_new SELECT * FROM knowledge_nodes');
        }
        else {
            db.exec(`INSERT INTO knowledge_nodes_new (id, entity_type, canonical_name, normalized_name, aliases, document_count, mention_count, edge_count, avg_confidence, metadata, provenance_id, created_at, updated_at)
                SELECT id, entity_type, canonical_name, normalized_name, aliases, document_count, mention_count, edge_count, avg_confidence, metadata, provenance_id, created_at, updated_at FROM knowledge_nodes`);
        }
        // Drop FTS table and triggers before dropping knowledge_nodes (FTS references it)
        db.exec('DROP TRIGGER IF EXISTS knowledge_nodes_fts_insert');
        db.exec('DROP TRIGGER IF EXISTS knowledge_nodes_fts_delete');
        db.exec('DROP TRIGGER IF EXISTS knowledge_nodes_fts_update');
        db.exec('DROP TABLE IF EXISTS knowledge_nodes_fts');
        db.exec('DROP TABLE knowledge_nodes');
        db.exec('ALTER TABLE knowledge_nodes_new RENAME TO knowledge_nodes');
        // Recreate knowledge_nodes indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_entity_type ON knowledge_nodes(entity_type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_normalized_name ON knowledge_nodes(normalized_name)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_kn_document_count ON knowledge_nodes(document_count)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_knowledge_nodes_canonical_lower ON knowledge_nodes(canonical_name COLLATE NOCASE)');
        // Recreate FTS5 table and triggers
        db.exec(`CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_nodes_fts USING fts5(canonical_name, content='knowledge_nodes', content_rowid='rowid')`);
        db.exec(`CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_insert AFTER INSERT ON knowledge_nodes BEGIN INSERT INTO knowledge_nodes_fts(rowid, canonical_name) VALUES (new.rowid, new.canonical_name); END`);
        db.exec(`CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_delete AFTER DELETE ON knowledge_nodes BEGIN INSERT INTO knowledge_nodes_fts(knowledge_nodes_fts, rowid, canonical_name) VALUES ('delete', old.rowid, old.canonical_name); END`);
        db.exec(`CREATE TRIGGER IF NOT EXISTS knowledge_nodes_fts_update AFTER UPDATE ON knowledge_nodes BEGIN INSERT INTO knowledge_nodes_fts(knowledge_nodes_fts, rowid, canonical_name) VALUES ('delete', old.rowid, old.canonical_name); INSERT INTO knowledge_nodes_fts(rowid, canonical_name) VALUES (new.rowid, new.canonical_name); END`);
        // Repopulate FTS from existing data
        const nodeCount = db.prepare('SELECT COUNT(*) as cnt FROM knowledge_nodes').get();
        if (nodeCount.cnt > 0) {
            db.exec(`
                INSERT INTO knowledge_nodes_fts(rowid, canonical_name)
                SELECT rowid, canonical_name FROM knowledge_nodes
            `);
        }
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v17->v18 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // ROLLBACK can itself throw (e.g. when BEGIN never succeeded); keep it separate
        // so that foreign keys are still re-enabled afterwards.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v17 to v18 (medical entity types): ${cause}`, 'migrate', 'entities', error);
    }
}
1914
/**
 * Migrate from schema version 18 to version 19 (entity extraction segments).
 *
 * Changes in v19:
 * - entity_extraction_segments: New table for chunked entity extraction with provenance.
 *   Stores 50K-character segments with 10% overlap for focused Gemini extraction.
 *   Each segment records its exact character_start/character_end in the OCR text
 *   and links to provenance for full traceability.
 * - 3 new indexes: idx_segments_document, idx_segments_status, idx_segments_doc_status
 *
 * The table and its indexes are created inside a single transaction, which is
 * rolled back if any statement fails.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV18ToV19(db) {
    try {
        // Statements are executed strictly in this order; the list is built inside
        // the try so any failure goes through the same rollback path.
        const steps = [
            'BEGIN TRANSACTION',
            // Table first, then the indexes that depend on it
            CREATE_ENTITY_EXTRACTION_SEGMENTS_TABLE,
            'CREATE INDEX IF NOT EXISTS idx_segments_document ON entity_extraction_segments(document_id)',
            'CREATE INDEX IF NOT EXISTS idx_segments_status ON entity_extraction_segments(extraction_status)',
            'CREATE INDEX IF NOT EXISTS idx_segments_doc_status ON entity_extraction_segments(document_id, extraction_status)',
            'COMMIT',
        ];
        for (const sql of steps) {
            db.exec(sql);
        }
    }
    catch (error) {
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            const rollbackMsg = rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr);
            console.error('[migrations] Rollback failed:', rollbackMsg);
        }
        let cause;
        if (error instanceof Error) {
            cause = error.message;
        }
        else {
            cause = String(error);
        }
        throw new MigrationError(`Failed to migrate from v18 to v19 (entity extraction segments): ${cause}`, 'migrate', 'entity_extraction_segments', error);
    }
}
1949
/**
 * Migrate from schema version 19 to version 20
 *
 * Changes in v20:
 * - knowledge_edges: Added valid_from, valid_until (TEXT) for temporal bounds
 * - knowledge_edges: Added normalized_weight (REAL DEFAULT 0) for weight normalization
 * - knowledge_edges: Added contradiction_count (INTEGER DEFAULT 0) for contradiction tracking
 * - knowledge_nodes: Added importance_score (REAL) for node ranking
 * - knowledge_nodes: Added resolution_type (TEXT) for entity resolution tracking
 * - chunks: Added ocr_quality_score (REAL)
 * - entity_embeddings: New (placeholder) table for entity vector embeddings
 * - vec_entity_embeddings: New (placeholder) sqlite-vec virtual table for entity semantic search
 * - 3 new indexes: idx_entity_embeddings_entity_id, idx_entity_embeddings_node_id,
 *   idx_entity_embeddings_content_hash
 *
 * Every column is added only when table_info shows it is missing, so the migration
 * can run against tables that already carry the v20 columns.
 *
 * Note: knowledge_nodes.updated_at already exists from the v16 schema, so it is NOT added here.
 *
 * @param db - Database instance from better-sqlite3
 * @throws Error the underlying failure is rethrown as-is (this function does not wrap
 *   it); a failing ROLLBACK is logged and does not replace the original error
 */
function migrateV19ToV20(db) {
    // Adds each listed column to `table` unless table_info already reports it.
    const addMissingColumns = (table, columns) => {
        const existing = new Set(db.pragma(`table_info(${table})`).map((c) => c.name));
        for (const [name, definition] of columns) {
            if (!existing.has(name)) {
                db.exec(`ALTER TABLE ${table} ADD COLUMN ${name} ${definition}`);
            }
        }
    };
    // M-5: PRAGMA foreign_keys in try-finally so it ALWAYS re-enables even on crash
    db.exec('PRAGMA foreign_keys = OFF');
    try {
        // M-5: Wrap all DDL in a transaction for atomicity
        db.exec('BEGIN TRANSACTION');
        try {
            // Step 1: Add new columns to knowledge_edges
            addMissingColumns('knowledge_edges', [
                ['valid_from', 'TEXT'],
                ['valid_until', 'TEXT'],
                ['normalized_weight', 'REAL DEFAULT 0'],
                ['contradiction_count', 'INTEGER DEFAULT 0'],
            ]);
            // Step 2: Add new columns to knowledge_nodes
            addMissingColumns('knowledge_nodes', [
                ['importance_score', 'REAL'],
                ['resolution_type', 'TEXT'],
            ]);
            // Step 3: Add ocr_quality_score to chunks
            addMissingColumns('chunks', [
                ['ocr_quality_score', 'REAL'],
            ]);
            // Step 4: Create placeholder entity_embeddings table (v21 will recreate with correct schema)
            db.exec(`CREATE TABLE IF NOT EXISTS entity_embeddings (
                id TEXT PRIMARY KEY,
                entity_id TEXT NOT NULL REFERENCES entities(id),
                node_id TEXT REFERENCES knowledge_nodes(id),
                embedding_model TEXT NOT NULL,
                dimensions INTEGER NOT NULL,
                content_hash TEXT NOT NULL,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                provenance_id TEXT REFERENCES provenance(id)
            )`);
            db.exec('CREATE INDEX IF NOT EXISTS idx_entity_embeddings_entity_id ON entity_embeddings(entity_id)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_entity_embeddings_node_id ON entity_embeddings(node_id)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_entity_embeddings_content_hash ON entity_embeddings(content_hash)');
            db.exec('COMMIT');
        }
        catch (error) {
            // A failing ROLLBACK must not replace the root-cause error, so it is only logged.
            try {
                db.exec('ROLLBACK');
            }
            catch (rollbackErr) {
                console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
            throw error;
        }
        // Step 5: Create placeholder vec_entity_embeddings virtual table (v21 will recreate with correct PK)
        // Note: Virtual table creation (vec0) is placed outside the transaction because
        // vec0 virtual tables may not support transactional DDL in all SQLite builds.
        db.exec(`CREATE VIRTUAL TABLE IF NOT EXISTS vec_entity_embeddings USING vec0(
            id TEXT PRIMARY KEY,
            embedding float[768] distance_metric=cosine
        )`);
        // FK integrity check after all DDL is committed
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v19->v20 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
    }
    finally {
        db.exec('PRAGMA foreign_keys = ON');
    }
}
2043
/**
 * Migrate from schema version 20 to version 21
 *
 * Changes in v21:
 * - Rebuild entity_embeddings table with correct columns:
 *   node_id, original_text, original_text_length, entity_type, document_count, model_name
 *   (v20 table had entity_id, embedding_model, dimensions which didn't match embed_entities code)
 * - Rebuild vec_entity_embeddings with entity_embedding_id PK (was id)
 *
 * Both tables are dropped and recreated; existing rows in them are not carried over.
 *
 * @param db - Database instance from better-sqlite3
 * @throws Error the underlying failure is rethrown as-is (this function does not wrap
 *   it); a failing ROLLBACK is logged and does not replace the original error
 */
function migrateV20ToV21(db) {
    // M-6: PRAGMA foreign_keys in try-finally so it ALWAYS re-enables even on crash
    db.exec('PRAGMA foreign_keys = OFF');
    try {
        // M-6: Wrap DROP + CREATE in a transaction for atomicity
        db.exec('BEGIN TRANSACTION');
        try {
            // Step 1: Drop and recreate entity_embeddings with correct schema
            // Safe because embed_entities never succeeded with the v20 schema
            // DROP TABLE removes the table's indexes automatically
            db.exec('DROP TABLE IF EXISTS entity_embeddings');
            db.exec(CREATE_ENTITY_EMBEDDINGS_TABLE);
            db.exec('CREATE INDEX IF NOT EXISTS idx_entity_embeddings_node_id ON entity_embeddings(node_id)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_entity_embeddings_content_hash ON entity_embeddings(content_hash)');
            db.exec('COMMIT');
        }
        catch (error) {
            // A failing ROLLBACK must not replace the root-cause error, so it is only logged.
            try {
                db.exec('ROLLBACK');
            }
            catch (rollbackErr) {
                console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
            }
            throw error;
        }
        // Step 2: Drop and recreate vec_entity_embeddings with correct PK column name
        // Note: Virtual table operations (vec0) are placed outside the transaction because
        // vec0 virtual tables may not support transactional DDL in all SQLite builds.
        db.exec('DROP TABLE IF EXISTS vec_entity_embeddings');
        db.exec(CREATE_VEC_ENTITY_EMBEDDINGS_TABLE);
        // FK integrity check after all DDL is committed
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v20->v21 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
    }
    finally {
        db.exec('PRAGMA foreign_keys = ON');
    }
}
2088
/**
 * Schema migration v21 -> v22: rebuild the knowledge_nodes FTS index.
 *
 * Addresses the FTS tokenizer and trigger inconsistencies F-S3, F-S4, F-S5:
 * - F-S3: the v18 migration created knowledge_nodes_fts WITHOUT the
 *   `porter unicode61` tokenizer (a fresh DB has it), so the table is recreated.
 * - F-S4: the v18 update trigger fired on ANY column update; the replacement
 *   fires only on `canonical_name` changes (AFTER UPDATE OF canonical_name).
 * - F-S5: v18 triggers were named `_insert/_delete/_update`; the replacements
 *   use the `_ai/_ad/_au` convention of the fresh schema definitions.
 *
 * All work happens inside one transaction that is rolled back on failure.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV21ToV22(db) {
    // Every trigger name variant (v18 naming and fresh-schema naming),
    // followed by the old FTS table itself.
    const teardownStatements = [
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_insert',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_delete',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_update',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_ai',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_ad',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_au',
        'DROP TABLE IF EXISTS knowledge_nodes_fts',
    ];
    try {
        db.exec('BEGIN TRANSACTION');
        // Step 1: remove the old FTS table and all trigger variants
        for (const statement of teardownStatements) {
            db.exec(statement);
        }
        // Step 2: recreate the FTS table from the shared schema definition
        db.exec(CREATE_KNOWLEDGE_NODES_FTS_TABLE);
        // Step 3: recreate the sync triggers from the shared schema definition
        for (const triggerSql of CREATE_KNOWLEDGE_NODES_FTS_TRIGGERS) {
            db.exec(triggerSql);
        }
        // Step 4: refill the index from rows already present in knowledge_nodes
        const countRow = db.prepare('SELECT COUNT(*) as cnt FROM knowledge_nodes').get();
        const hasNodes = countRow.cnt > 0;
        if (hasNodes) {
            db.exec(`
                INSERT INTO knowledge_nodes_fts(rowid, canonical_name)
                SELECT rowid, canonical_name FROM knowledge_nodes
            `);
        }
        db.exec('COMMIT');
    }
    catch (error) {
        // Best-effort rollback; a rollback failure is logged, not rethrown.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v21 to v22 (FTS tokenizer/trigger fix): ${cause}`, 'migrate', 'knowledge_nodes_fts', error);
    }
}
2142
/**
 * Migrate from schema version 22 to version 23
 *
 * Changes in v23:
 * - Add 4 medical relationship types to knowledge_edges CHECK constraint:
 *   treated_with, administered_via, managed_by, interacts_with
 *
 * Strategy: Recreate knowledge_edges table with updated CHECK constraint,
 * copy all existing data, swap tables. If knowledge_edges does not exist
 * the migration commits immediately without changing anything.
 *
 * On failure the transaction is rolled back and foreign key enforcement is
 * re-enabled. Those two recovery steps are attempted independently, so a
 * failing ROLLBACK cannot leave foreign keys switched off.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV22ToV23(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Check if knowledge_edges table exists (KG tables are only created in v15+)
        const tableExists = db
            .prepare("SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name='knowledge_edges'")
            .get();
        if (tableExists.cnt === 0) {
            // No knowledge_edges table - nothing to migrate
            db.exec('COMMIT');
            db.exec('PRAGMA foreign_keys = ON');
            return;
        }
        // Step 1: Create new table with expanded CHECK constraint
        db.exec(`
            CREATE TABLE knowledge_edges_new (
                id TEXT PRIMARY KEY,
                source_node_id TEXT NOT NULL,
                target_node_id TEXT NOT NULL,
                relationship_type TEXT NOT NULL CHECK (relationship_type IN (
                    'co_mentioned', 'co_located', 'works_at', 'represents',
                    'located_in', 'filed_in', 'cites', 'references',
                    'party_to', 'related_to', 'precedes', 'occurred_at',
                    'treated_with', 'administered_via', 'managed_by', 'interacts_with'
                )),
                weight REAL NOT NULL DEFAULT 1.0,
                evidence_count INTEGER NOT NULL DEFAULT 1,
                document_ids TEXT NOT NULL,
                metadata TEXT,
                provenance_id TEXT NOT NULL,
                created_at TEXT NOT NULL,
                valid_from TEXT,
                valid_until TEXT,
                normalized_weight REAL DEFAULT 0,
                contradiction_count INTEGER DEFAULT 0,
                FOREIGN KEY (source_node_id) REFERENCES knowledge_nodes(id),
                FOREIGN KEY (target_node_id) REFERENCES knowledge_nodes(id),
                FOREIGN KEY (provenance_id) REFERENCES provenance(id)
            )
        `);
        // Step 2: Copy all existing data (explicit column list, so the copy
        // does not depend on the old table's physical column order)
        db.exec(`
            INSERT INTO knowledge_edges_new
            SELECT id, source_node_id, target_node_id, relationship_type,
                weight, evidence_count, document_ids, metadata,
                provenance_id, created_at, valid_from, valid_until,
                normalized_weight, contradiction_count
            FROM knowledge_edges
        `);
        // Step 3: Drop old table and rename
        db.exec('DROP TABLE knowledge_edges');
        db.exec('ALTER TABLE knowledge_edges_new RENAME TO knowledge_edges');
        // Step 4: Recreate indexes (matching schema-definitions.ts names)
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_source_node ON knowledge_edges(source_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_target_node ON knowledge_edges(target_node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_ke_relationship_type ON knowledge_edges(relationship_type)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v22->v23 migration: ${fkViolations.length} violation(s). ` +
                `First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Recovery step 1: undo the partial migration (best effort).
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        // Recovery step 2: restore FK enforcement. Kept in its own try so it
        // still runs when ROLLBACK throws (e.g. no transaction was active).
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v22 to v23 (medical relationship types): ${cause}`, 'migrate', 'knowledge_edges', error);
    }
}
2232
/**
 * Schema migration v23 -> v24: index entity_mentions by document.
 *
 * Creates idx_entity_mentions_document_id on entity_mentions(document_id) so
 * queries that filter or join on document_id avoid full table scans.
 * When the entity_mentions table is absent the migration does nothing.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV23ToV24(db) {
    try {
        // entity_mentions was introduced in v14; look it up before indexing it
        const lookup = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='entity_mentions'");
        const mentionsTable = lookup.get();
        if (!mentionsTable) {
            return;
        }
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_mentions_document_id ON entity_mentions(document_id)');
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v23 to v24 (entity_mentions document_id index): ${cause}`, 'migrate', 'entity_mentions', error);
    }
}
2257
/**
 * Migrate from schema version 24 to version 25
 *
 * Changes in v25 (AI Knowledge Synthesis):
 * - corpus_intelligence: New table for corpus-level AI summaries
 * - document_narratives: New table for document-level AI narratives
 * - entity_roles: New table for AI-determined entity roles
 * - knowledge_edges: 6 new relationship types added to CHECK constraint
 *   (only when the knowledge_edges table exists)
 * - provenance: CORPUS_INTELLIGENCE added to type and source_type CHECK constraints
 * - 6 new indexes for the new tables
 *
 * On failure the transaction is rolled back and foreign key enforcement is
 * re-enabled. Those two recovery steps are attempted independently, so a
 * failing ROLLBACK cannot leave foreign keys switched off.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV24ToV25(db) {
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: Create 3 new tables
        db.exec(CREATE_CORPUS_INTELLIGENCE_TABLE);
        db.exec(CREATE_DOCUMENT_NARRATIVES_TABLE);
        db.exec(CREATE_ENTITY_ROLES_TABLE);
        // Step 2: Create 6 new indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_corpus_intelligence_database ON corpus_intelligence(database_name)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_document_narratives_document ON document_narratives(document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_roles_node ON entity_roles(node_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_roles_theme ON entity_roles(theme)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_roles_role ON entity_roles(role)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_entity_roles_scope ON entity_roles(scope, scope_id)');
        // Step 3: Expand knowledge_edges CHECK constraint with 6 new relationship types
        const edgesTableExists = db.prepare("SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name='knowledge_edges'").get();
        if (edgesTableExists.cnt > 0) {
            db.exec(`
                CREATE TABLE knowledge_edges_new (
                    id TEXT PRIMARY KEY,
                    source_node_id TEXT NOT NULL,
                    target_node_id TEXT NOT NULL,
                    relationship_type TEXT NOT NULL CHECK (relationship_type IN (
                        'co_mentioned', 'co_located', 'works_at', 'represents',
                        'located_in', 'filed_in', 'cites', 'references',
                        'party_to', 'related_to', 'precedes', 'occurred_at',
                        'treated_with', 'administered_via', 'managed_by', 'interacts_with',
                        'diagnosed_with', 'prescribed_by', 'admitted_to', 'supervised_by', 'filed_by', 'contraindicated_with'
                    )),
                    weight REAL NOT NULL DEFAULT 1.0,
                    evidence_count INTEGER NOT NULL DEFAULT 1,
                    document_ids TEXT NOT NULL,
                    metadata TEXT,
                    provenance_id TEXT NOT NULL,
                    created_at TEXT NOT NULL,
                    valid_from TEXT,
                    valid_until TEXT,
                    normalized_weight REAL DEFAULT 0,
                    contradiction_count INTEGER DEFAULT 0,
                    FOREIGN KEY (source_node_id) REFERENCES knowledge_nodes(id),
                    FOREIGN KEY (target_node_id) REFERENCES knowledge_nodes(id),
                    FOREIGN KEY (provenance_id) REFERENCES provenance(id)
                )
            `);
            db.exec(`
                INSERT INTO knowledge_edges_new
                SELECT id, source_node_id, target_node_id, relationship_type,
                    weight, evidence_count, document_ids, metadata,
                    provenance_id, created_at, valid_from, valid_until,
                    normalized_weight, contradiction_count
                FROM knowledge_edges
            `);
            db.exec('DROP TABLE knowledge_edges');
            db.exec('ALTER TABLE knowledge_edges_new RENAME TO knowledge_edges');
            db.exec('CREATE INDEX IF NOT EXISTS idx_ke_source_node ON knowledge_edges(source_node_id)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_ke_target_node ON knowledge_edges(target_node_id)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_ke_relationship_type ON knowledge_edges(relationship_type)');
        }
        // Step 4: Add CORPUS_INTELLIGENCE to provenance type and source_type CHECK constraints
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING', 'KNOWLEDGE_GRAPH', 'CORPUS_INTELLIGENCE')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'ENTITY_EXTRACTION', 'COMPARISON', 'CLUSTERING', 'KNOWLEDGE_GRAPH', 'CORPUS_INTELLIGENCE')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // NOTE(review): SELECT * relies on the existing provenance table having
        // exactly these columns in this order - confirm against earlier migrations.
        db.exec('INSERT INTO provenance_new SELECT * FROM provenance');
        db.exec('DROP TABLE provenance');
        db.exec('ALTER TABLE provenance_new RENAME TO provenance');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)');
        // M-5: Verify FK integrity BEFORE commit so violations cause rollback
        const fkViolations = db.pragma('foreign_key_check');
        if (fkViolations.length > 0) {
            throw new Error(`Foreign key integrity check failed after v24->v25 migration: ${fkViolations.length} violation(s). First: ${JSON.stringify(fkViolations[0])}`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
    }
    catch (error) {
        // Recovery step 1: undo the partial migration (best effort).
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackErr) {
            console.error('[migrations] Rollback failed:', rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr));
        }
        // Recovery step 2: restore FK enforcement. Kept in its own try so it
        // still runs when ROLLBACK throws (e.g. no transaction was active).
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (pragmaErr) {
            console.error('[migrations] Failed to re-enable foreign keys:', pragmaErr instanceof Error ? pragmaErr.message : String(pragmaErr));
        }
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v24 to v25 (AI Knowledge Synthesis tables): ${cause}`, 'migrate', 'corpus_intelligence', error);
    }
}
2387
/**
 * Schema migration v25 -> v26: remove entity extraction and the knowledge graph.
 *
 * BREAKING CHANGE: these features are removed entirely, with no backwards
 * compatibility.
 *
 * What is dropped:
 * - entities, entity_mentions, knowledge_nodes, knowledge_edges
 * - node_entity_links, entity_extraction_segments
 * - entity_embeddings, vec_entity_embeddings
 * - corpus_intelligence, document_narratives, entity_roles
 * - knowledge_nodes_fts (FTS5 virtual table)
 * - all associated triggers and indexes
 *
 * What is rebuilt:
 * - provenance, without ENTITY_EXTRACTION/KNOWLEDGE_GRAPH/CORPUS_INTELLIGENCE
 *   in its CHECK constraints; rows of those types are not copied over
 * - comparisons, without the entity_diff_json column
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV25ToV26(db) {
    // FTS sync triggers; removed before any table is dropped
    const ftsTriggerDrops = [
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_ai',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_ad',
        'DROP TRIGGER IF EXISTS knowledge_nodes_fts_au',
    ];
    // Index names belonging to the entity/KG tables
    const entityKgIndexes = [
        'idx_entities_document_id', 'idx_entities_entity_type', 'idx_entities_normalized_text',
        'idx_entity_mentions_entity_id', 'idx_entity_mentions_document_id', 'idx_entity_mentions_chunk_id',
        'idx_kn_entity_type', 'idx_kn_normalized_name', 'idx_kn_document_count',
        'idx_ke_source_node', 'idx_ke_target_node', 'idx_ke_relationship_type',
        'idx_nel_node_id', 'idx_nel_document_id',
        'idx_knowledge_nodes_canonical_lower',
        'idx_segments_document', 'idx_segments_status', 'idx_segments_doc_status',
        'idx_entity_embeddings_node_id', 'idx_entity_embeddings_content_hash',
        'idx_corpus_intelligence_database',
        'idx_document_narratives_document',
        'idx_entity_roles_node', 'idx_entity_roles_theme', 'idx_entity_roles_role', 'idx_entity_roles_scope',
    ];
    // Virtual tables first (no FK dependencies), then tables with outgoing
    // FKs, ending with the tables they point at
    const tableDrops = [
        'DROP TABLE IF EXISTS vec_entity_embeddings',
        'DROP TABLE IF EXISTS knowledge_nodes_fts',
        'DROP TABLE IF EXISTS entity_roles',
        'DROP TABLE IF EXISTS document_narratives',
        'DROP TABLE IF EXISTS corpus_intelligence',
        'DROP TABLE IF EXISTS entity_embeddings',
        'DROP TABLE IF EXISTS entity_extraction_segments',
        'DROP TABLE IF EXISTS node_entity_links',
        'DROP TABLE IF EXISTS knowledge_edges',
        'DROP TABLE IF EXISTS entity_mentions',
        'DROP TABLE IF EXISTS entities',
        'DROP TABLE IF EXISTS knowledge_nodes',
    ];
    // Swap-in steps for the rebuilt provenance table
    const provenanceSwap = [
        'DROP TABLE provenance',
        'ALTER TABLE provenance_new RENAME TO provenance',
        'CREATE INDEX IF NOT EXISTS idx_provenance_source_id ON provenance(source_id)',
        'CREATE INDEX IF NOT EXISTS idx_provenance_type ON provenance(type)',
        'CREATE INDEX IF NOT EXISTS idx_provenance_root_document_id ON provenance(root_document_id)',
        'CREATE INDEX IF NOT EXISTS idx_provenance_parent_id ON provenance(parent_id)',
    ];
    // Swap-in steps for the rebuilt comparisons table
    const comparisonsSwap = [
        'DROP TABLE comparisons',
        'ALTER TABLE comparisons_new RENAME TO comparisons',
        'CREATE INDEX IF NOT EXISTS idx_comparisons_doc1 ON comparisons(document_id_1)',
        'CREATE INDEX IF NOT EXISTS idx_comparisons_doc2 ON comparisons(document_id_2)',
        'CREATE INDEX IF NOT EXISTS idx_comparisons_created ON comparisons(created_at)',
    ];
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.exec('BEGIN TRANSACTION');
        // Step 1: triggers
        for (const sql of ftsTriggerDrops) {
            db.exec(sql);
        }
        // Step 2: indexes (IF EXISTS for safety)
        for (const indexName of entityKgIndexes) {
            db.exec(`DROP INDEX IF EXISTS ${indexName}`);
        }
        // Step 3: tables
        for (const sql of tableDrops) {
            db.exec(sql);
        }
        // Step 4: provenance without the entity/KG types
        db.exec(`
            CREATE TABLE provenance_new (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL CHECK (type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'COMPARISON', 'CLUSTERING')),
                created_at TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                source_file_created_at TEXT,
                source_file_modified_at TEXT,
                source_type TEXT NOT NULL CHECK (source_type IN ('FILE', 'OCR', 'CHUNKING', 'IMAGE_EXTRACTION', 'VLM', 'VLM_DEDUP', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'COMPARISON', 'CLUSTERING')),
                source_path TEXT,
                source_id TEXT,
                root_document_id TEXT NOT NULL,
                location TEXT,
                content_hash TEXT NOT NULL,
                input_hash TEXT,
                file_hash TEXT,
                processor TEXT NOT NULL,
                processor_version TEXT NOT NULL,
                processing_params TEXT NOT NULL,
                processing_duration_ms INTEGER,
                processing_quality_score REAL,
                parent_id TEXT,
                parent_ids TEXT NOT NULL,
                chain_depth INTEGER NOT NULL,
                chain_path TEXT,
                FOREIGN KEY (source_id) REFERENCES provenance_new(id),
                FOREIGN KEY (parent_id) REFERENCES provenance_new(id)
            )
        `);
        // Copy only rows whose type survives; entity/KG provenance is discarded
        db.exec(`
            INSERT INTO provenance_new SELECT * FROM provenance
            WHERE type IN ('DOCUMENT', 'OCR_RESULT', 'CHUNK', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING', 'EXTRACTION', 'FORM_FILL', 'COMPARISON', 'CLUSTERING')
        `);
        for (const sql of provenanceSwap) {
            db.exec(sql);
        }
        // Step 5: comparisons without entity_diff_json
        db.exec(`
            CREATE TABLE comparisons_new (
                id TEXT PRIMARY KEY NOT NULL,
                document_id_1 TEXT NOT NULL REFERENCES documents(id),
                document_id_2 TEXT NOT NULL REFERENCES documents(id),
                similarity_ratio REAL NOT NULL,
                text_diff_json TEXT NOT NULL,
                structural_diff_json TEXT NOT NULL,
                summary TEXT NOT NULL,
                content_hash TEXT NOT NULL,
                provenance_id TEXT NOT NULL REFERENCES provenance(id),
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                processing_duration_ms INTEGER
            )
        `);
        db.exec(`
            INSERT INTO comparisons_new
            SELECT id, document_id_1, document_id_2, similarity_ratio,
                text_diff_json, structural_diff_json,
                summary, content_hash, provenance_id, created_at, processing_duration_ms
            FROM comparisons
        `);
        for (const sql of comparisonsSwap) {
            db.exec(sql);
        }
        // M-5: FK integrity is verified BEFORE commit so violations roll back
        const fkViolations = db.pragma('foreign_key_check');
        const violationCount = fkViolations.length;
        if (violationCount > 0) {
            console.error(`[Migration v25->v26] FK violations detected: ${JSON.stringify(fkViolations.slice(0, 5))}`);
            throw new Error(`Foreign key integrity check failed after v25->v26 migration: ${violationCount} violation(s)`);
        }
        db.exec('COMMIT');
        db.exec('PRAGMA foreign_keys = ON');
        console.error('[Migration] v25 -> v26: Removed entity extraction and knowledge graph tables');
    }
    catch (error) {
        // Rollback is best effort; its failure is logged and the original
        // error is still reported below.
        try {
            db.exec('ROLLBACK');
        }
        catch (rollbackError) {
            console.error(`[migrations] CRITICAL: Failed to rollback v25->v26 migration: ${rollbackError instanceof Error ? rollbackError.message : String(rollbackError)}`);
        }
        db.exec('PRAGMA foreign_keys = ON');
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v25 to v26 (entity/KG removal): ${cause}`, 'migrate', 'entity_kg_removal', error);
    }
}
2534
/**
 * Migrate from schema version 26 to version 27
 *
 * Changes in v27 (Hybrid Section-Aware Chunking - Phase 1):
 * - chunks.heading_context: Heading text providing context for the chunk
 * - chunks.heading_level: Heading level (1-6) of the section
 * - chunks.section_path: Full section path (e.g., "Introduction > Background")
 * - chunks.content_types: JSON array of content types in the chunk
 * - chunks.is_atomic: Whether chunk should not be split further (default 0)
 * - chunks.chunking_strategy: Strategy used to create the chunk (default 'hybrid_section')
 *
 * Uses ALTER TABLE ADD COLUMN (safe for nullable/defaulted columns, no table recreation needed).
 * Columns that already exist are skipped, so a retry after a partial failure is safe.
 *
 * Foreign key enforcement is switched off for the duration and is always
 * switched back on in a finally block. That includes the case where reading
 * the existing column list fails before the transaction starts.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV26ToV27(db) {
    // Column name -> DDL that adds it, in the order the columns are added.
    const columnDdl = [
        ['heading_context', 'ALTER TABLE chunks ADD COLUMN heading_context TEXT'],
        ['heading_level', 'ALTER TABLE chunks ADD COLUMN heading_level INTEGER'],
        ['section_path', 'ALTER TABLE chunks ADD COLUMN section_path TEXT'],
        ['content_types', 'ALTER TABLE chunks ADD COLUMN content_types TEXT'],
        ['is_atomic', 'ALTER TABLE chunks ADD COLUMN is_atomic INTEGER NOT NULL DEFAULT 0'],
        ['chunking_strategy', "ALTER TABLE chunks ADD COLUMN chunking_strategy TEXT NOT NULL DEFAULT 'hybrid_section'"],
    ];
    db.exec('PRAGMA foreign_keys = OFF');
    try {
        // Check existing columns for idempotency (safe on retry after partial failure)
        const columns = db.prepare('PRAGMA table_info(chunks)').all();
        const columnNames = new Set(columns.map((c) => c.name));
        const transaction = db.transaction(() => {
            for (const [name, ddl] of columnDdl) {
                if (!columnNames.has(name)) {
                    db.exec(ddl);
                }
            }
            // M-5: FK integrity check inside transaction so violations cause rollback
            const fkViolations = db.pragma('foreign_key_check');
            if (fkViolations.length > 0) {
                throw new Error(`Foreign key integrity check failed after v26->v27 migration: ${fkViolations.length} violation(s). ` +
                    `First: ${JSON.stringify(fkViolations[0])}`);
            }
        });
        transaction();
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v26 to v27 (hybrid section-aware chunking columns): ${cause}`, 'migrate', 'chunks', error);
    }
    finally {
        // Runs on success and on every failure path, including a failing
        // table_info read.
        db.exec('PRAGMA foreign_keys = ON');
    }
    // Only reached on success: the catch above always rethrows.
    console.error('[Migration] v26 -> v27: Added hybrid section-aware chunking columns to chunks table');
}
2592
/**
 * Schema migration v27 -> v28: persisted search results.
 *
 * Adds:
 * - saved_searches table
 * - indexes idx_saved_searches_name, idx_saved_searches_search_type,
 *   idx_saved_searches_created
 *
 * The table and its indexes are created inside a single transaction.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV27ToV28(db) {
    console.error('[MIGRATION] Applying v27 → v28: Add saved_searches table');
    const savedSearchIndexes = [
        'CREATE INDEX IF NOT EXISTS idx_saved_searches_name ON saved_searches(name)',
        'CREATE INDEX IF NOT EXISTS idx_saved_searches_search_type ON saved_searches(search_type)',
        'CREATE INDEX IF NOT EXISTS idx_saved_searches_created ON saved_searches(created_at DESC)',
    ];
    // L-5: table + index creation share one transaction for atomicity
    const createSavedSearches = () => {
        db.exec(`
            CREATE TABLE IF NOT EXISTS saved_searches (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                query TEXT NOT NULL,
                search_type TEXT NOT NULL CHECK (search_type IN ('bm25', 'semantic', 'hybrid')),
                search_params TEXT NOT NULL DEFAULT '{}',
                result_count INTEGER NOT NULL,
                result_ids TEXT NOT NULL DEFAULT '[]',
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                notes TEXT
            )
        `);
        for (const indexSql of savedSearchIndexes) {
            db.exec(indexSql);
        }
    };
    try {
        const runInTransaction = db.transaction(createSavedSearches);
        runInTransaction();
        console.error('[MIGRATION] v28 migration complete: saved_searches table created');
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v27 to v28 (saved_searches table): ${cause}`, 'migrate', 'saved_searches', error);
    }
}
2632
/**
 * Schema migration v28 -> v29: tagging support.
 *
 * Adds:
 * - tags: user-defined tag labels
 * - entity_tags: cross-entity tag assignments (document, chunk, image, extraction, cluster)
 * - indexes idx_entity_tags_entity and idx_entity_tags_tag
 *
 * Both tables and both indexes are created inside a single transaction.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV28ToV29(db) {
    console.error('[MIGRATION] Applying v28 → v29: Add tags and entity_tags tables');
    // L-5: executed in order inside one transaction for atomicity
    const createTagSchema = () => {
        const statements = [
            CREATE_TAGS_TABLE,
            CREATE_ENTITY_TAGS_TABLE,
            'CREATE INDEX IF NOT EXISTS idx_entity_tags_entity ON entity_tags(entity_id, entity_type)',
            'CREATE INDEX IF NOT EXISTS idx_entity_tags_tag ON entity_tags(tag_id)',
        ];
        for (const sql of statements) {
            db.exec(sql);
        }
    };
    try {
        const runInTransaction = db.transaction(createTagSchema);
        runInTransaction();
        console.error('[MIGRATION] v29 migration complete: tags and entity_tags tables created');
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v28 to v29 (tags tables): ${cause}`, 'migrate', 'tags', error);
    }
}
2661
/**
 * Schema migration v29 -> v30: document full-text search and search analytics.
 *
 * Adds:
 * - documents_fts: FTS5 virtual table over doc_title, doc_author, doc_subject
 * - documents_fts sync triggers
 * - saved_searches.last_executed_at (TEXT) and saved_searches.execution_count (INTEGER)
 * - indexes idx_chunks_section_path and idx_chunks_heading_level
 *
 * The virtual table and triggers are created first, outside the transaction;
 * index population, the column additions and the chunk indexes then run
 * inside one transaction.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails
 */
function migrateV29ToV30(db) {
    console.error('[MIGRATION] Applying v29 → v30: Documents FTS5, saved search analytics, chunk indexes');
    // Analytics columns for saved_searches: name paired with the DDL adding it
    const analyticsColumns = [
        ['last_executed_at', 'ALTER TABLE saved_searches ADD COLUMN last_executed_at TEXT'],
        ['execution_count', 'ALTER TABLE saved_searches ADD COLUMN execution_count INTEGER DEFAULT 0'],
    ];
    // L-5: body of the transaction. The delete-all + insert pair must be
    // atomic so a crash cannot leave an empty index behind.
    const populateAndExtend = () => {
        // 3. Fill the index from existing rows (cleared first so a retry
        //    after a crash does not duplicate entries)
        db.exec("INSERT INTO documents_fts(documents_fts) VALUES('delete-all')");
        db.exec(`
            INSERT INTO documents_fts(rowid, doc_title, doc_author, doc_subject)
            SELECT rowid, COALESCE(doc_title, ''), COALESCE(doc_author, ''), COALESCE(doc_subject, '')
            FROM documents
        `);
        // 4. Add the analytics columns that are not there yet
        const tableInfo = db.prepare('PRAGMA table_info(saved_searches)').all();
        const present = new Set(tableInfo.map((column) => column.name));
        for (const [columnName, ddl] of analyticsColumns) {
            if (!present.has(columnName)) {
                db.exec(ddl);
            }
        }
        // 5. Chunk performance indexes
        db.exec('CREATE INDEX IF NOT EXISTS idx_chunks_section_path ON chunks(section_path)');
        db.exec('CREATE INDEX IF NOT EXISTS idx_chunks_heading_level ON chunks(heading_level)');
    };
    try {
        // 1. FTS5 virtual table. Created outside the transaction because
        //    virtual tables manage their own storage and may not support
        //    transactional DDL.
        db.exec(CREATE_DOCUMENTS_FTS_TABLE);
        // 2. Sync triggers
        for (const triggerSql of CREATE_DOCUMENTS_FTS_TRIGGERS) {
            db.exec(triggerSql);
        }
        const runInTransaction = db.transaction(populateAndExtend);
        runInTransaction();
        console.error('[MIGRATION] v30 migration complete: documents_fts, saved search analytics, chunk indexes');
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate from v29 to v30 (documents FTS, saved search analytics): ${cause}`, 'migrate', 'documents_fts', error);
    }
}
2715
/**
 * Migration v30 → v31: Document metadata indexes, VLM text enrichment
 *
 * Changes:
 * - New indexes: idx_documents_doc_author, idx_documents_doc_subject
 * - Backfills VLM extracted text into embeddings for FTS searchability
 *
 * M-6: bumpVersion is called inside the transaction so migration body and
 * version bump are atomic. If the process crashes, both roll back together.
 *
 * @param db - Database instance from better-sqlite3
 * @param bumpVersion - Callback to bump schema version (called inside transaction with 31)
 * @throws MigrationError if migration fails (the original error is passed along as the cause)
 */
function migrateV30ToV31(db, bumpVersion) {
    console.error('[MIGRATION] Applying v30 → v31: document metadata indexes, VLM text enrichment');
    try {
        // M-6 / H-3: Wrap entire migration body + bumpVersion in a single transaction
        // so the UPDATE and version bump are atomic. If the process crashes between
        // them, both roll back and the migration re-runs cleanly on restart.
        const transaction = db.transaction(() => {
            db.exec('CREATE INDEX IF NOT EXISTS idx_documents_doc_author ON documents(doc_author)');
            db.exec('CREATE INDEX IF NOT EXISTS idx_documents_doc_subject ON documents(doc_subject)');
            // T2.10: Backfill VLM extracted text into embeddings for FTS searchability.
            // Appends the elements of vlm_structured_data.$.extractedText to the
            // embedding's original_text, separated by single spaces.
            //
            // H-3: Only rows where GROUP_CONCAT produces a non-NULL result are updated,
            // so original_text never gets a dangling trailing space.
            //
            // L-12: The array check uses the PATH form json_type(X, '$.extractedText')
            // and json_each(X, '$.extractedText'). The previous form
            // json_type(json_extract(X, P)) re-parsed the extracted SQL value as JSON:
            // a plain string value (e.g. "hello world") raised "malformed JSON" and
            // aborted the migration, and a string that merely looked like "[...]" was
            // treated as an array. The path form inspects the element in place and
            // yields NULL when the path is absent, so no separate IS NOT NULL test is needed.
            db.exec(`
                UPDATE embeddings SET original_text = original_text || ' ' || (
                    SELECT GROUP_CONCAT(value, ' ')
                    FROM images i, json_each(i.vlm_structured_data, '$.extractedText')
                    WHERE i.id = embeddings.image_id
                      AND i.vlm_structured_data IS NOT NULL
                      AND json_valid(i.vlm_structured_data)
                      AND json_type(i.vlm_structured_data, '$.extractedText') = 'array'
                )
                WHERE embeddings.image_id IS NOT NULL
                  AND EXISTS (
                    SELECT 1 FROM images i
                    WHERE i.id = embeddings.image_id
                      AND i.vlm_structured_data IS NOT NULL
                      AND json_valid(i.vlm_structured_data)
                      AND json_type(i.vlm_structured_data, '$.extractedText') = 'array'
                  )
                  AND (
                    SELECT GROUP_CONCAT(value, ' ')
                    FROM images i, json_each(i.vlm_structured_data, '$.extractedText')
                    WHERE i.id = embeddings.image_id
                      AND i.vlm_structured_data IS NOT NULL
                      AND json_valid(i.vlm_structured_data)
                      AND json_type(i.vlm_structured_data, '$.extractedText') = 'array'
                  ) IS NOT NULL
            `);
            // Rebuild VLM FTS index to pick up the updated text.
            // H-4: Check table existence first. If vlm_fts doesn't exist yet, skip
            // cleanly. Any OTHER error (corruption, SQL error) must propagate and
            // fail the migration.
            const vlmFtsExists = db
                .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='vlm_fts'")
                .get();
            if (vlmFtsExists) {
                // Use delete-all + selective re-insert (NOT 'rebuild') because FTS5
                // external content 'rebuild' reads ALL rows from embeddings table,
                // including chunk embeddings (image_id IS NULL), creating ghost VLM results.
                db.exec("INSERT INTO vlm_fts(vlm_fts) VALUES('delete-all')");
                db.exec(`
                    INSERT INTO vlm_fts(rowid, original_text)
                    SELECT rowid, original_text FROM embeddings WHERE image_id IS NOT NULL
                `);
                console.error('[MIGRATION] VLM FTS index rebuilt with extracted text');
            }
            else {
                console.error('[MIGRATION] VLM FTS table does not exist yet, skipping rebuild');
            }
            // M-6: Bump version inside the transaction so it's atomic with the body
            bumpVersion(31);
        });
        transaction();
        console.error('[MIGRATION] v31 migration complete: indexes + VLM text enrichment');
    }
    catch (error) {
        const cause = error instanceof Error ? error.message : String(error);
        throw new MigrationError(`Failed to migrate v30 to v31: ${cause}`, 'migrate', 'document_indexes', error);
    }
}
2807
/**
 * Migration v31 → v32: multi-user, collaboration, workflow, CLM, and webhook schema.
 *
 * Changes:
 * - 10 new tables: users, audit_log, annotations, document_locks, workflow_states,
 *   approval_chains, approval_steps, obligations, playbooks, webhooks
 * - provenance: 4 new columns (user_id, agent_id, agent_metadata_json, chain_hash)
 * - saved_searches: 4 new columns (user_id, is_shared, alert_enabled, last_alert_at)
 * - new indexes on users, audit_log, annotations, workflow_states, approval_steps
 *   and obligations
 *
 * NOTE(review): this function issues 21 CREATE INDEX statements, while the
 * completion log message reports 23 — confirm whether the remaining two are
 * created inside the table DDL constants or the message is out of date.
 *
 * Foreign key enforcement is switched off before the transaction and switched
 * back on afterwards, on both the success and the failure path. This function
 * does not bump the stored schema version itself.
 *
 * @param db - Database instance from better-sqlite3
 * @throws MigrationError if migration fails (the original error is passed along as the cause)
 */
function migrateV31ToV32(db) {
    console.error('[MIGRATION] Applying v31 → v32: multi-user, collaboration, workflow, CLM, webhooks');
    const messageOf = (err) => (err instanceof Error ? err.message : String(err));
    // Reads the table's column list once, then runs each ALTER whose column is absent.
    const addMissingColumns = (tableInfoPragma, additions) => {
        const present = db
            .prepare(tableInfoPragma)
            .all()
            .map((column) => column.name);
        for (const [columnName, alterSql] of additions) {
            if (!present.includes(columnName)) {
                db.exec(alterSql);
            }
        }
    };
    try {
        db.exec('PRAGMA foreign_keys = OFF');
        db.transaction(() => {
            // Step 1: the 10 new tables, users first (other tables are expected to reference it).
            const tableDdl = [
                CREATE_USERS_TABLE,
                CREATE_AUDIT_LOG_TABLE,
                CREATE_ANNOTATIONS_TABLE,
                CREATE_DOCUMENT_LOCKS_TABLE,
                CREATE_WORKFLOW_STATES_TABLE,
                CREATE_APPROVAL_CHAINS_TABLE,
                CREATE_APPROVAL_STEPS_TABLE,
                CREATE_OBLIGATIONS_TABLE,
                CREATE_PLAYBOOKS_TABLE,
                CREATE_WEBHOOKS_TABLE,
            ];
            for (const ddl of tableDdl) {
                db.exec(ddl);
            }
            // Step 2: provenance columns (skipped when already present).
            addMissingColumns('PRAGMA table_info(provenance)', [
                ['user_id', 'ALTER TABLE provenance ADD COLUMN user_id TEXT'],
                ['agent_id', 'ALTER TABLE provenance ADD COLUMN agent_id TEXT'],
                ['agent_metadata_json', 'ALTER TABLE provenance ADD COLUMN agent_metadata_json TEXT'],
                ['chain_hash', 'ALTER TABLE provenance ADD COLUMN chain_hash TEXT'],
            ]);
            // Step 3: saved_searches columns (skipped when already present).
            addMissingColumns('PRAGMA table_info(saved_searches)', [
                ['user_id', 'ALTER TABLE saved_searches ADD COLUMN user_id TEXT'],
                ['is_shared', 'ALTER TABLE saved_searches ADD COLUMN is_shared INTEGER DEFAULT 0'],
                ['alert_enabled', 'ALTER TABLE saved_searches ADD COLUMN alert_enabled INTEGER DEFAULT 0'],
                ['last_alert_at', 'ALTER TABLE saved_searches ADD COLUMN last_alert_at TEXT'],
            ]);
            // Step 4: indexes, grouped by table and executed in this order.
            const indexDdl = [
                // users
                'CREATE INDEX IF NOT EXISTS idx_users_external_id ON users(external_id)',
                'CREATE INDEX IF NOT EXISTS idx_users_role ON users(role)',
                // audit_log
                'CREATE INDEX IF NOT EXISTS idx_audit_log_user ON audit_log(user_id)',
                'CREATE INDEX IF NOT EXISTS idx_audit_log_action ON audit_log(action)',
                'CREATE INDEX IF NOT EXISTS idx_audit_log_entity ON audit_log(entity_type, entity_id)',
                'CREATE INDEX IF NOT EXISTS idx_audit_log_created ON audit_log(created_at)',
                // annotations
                'CREATE INDEX IF NOT EXISTS idx_annotations_document ON annotations(document_id)',
                'CREATE INDEX IF NOT EXISTS idx_annotations_chunk ON annotations(chunk_id)',
                'CREATE INDEX IF NOT EXISTS idx_annotations_user ON annotations(user_id)',
                'CREATE INDEX IF NOT EXISTS idx_annotations_type ON annotations(annotation_type)',
                'CREATE INDEX IF NOT EXISTS idx_annotations_status ON annotations(status)',
                // workflow_states
                'CREATE INDEX IF NOT EXISTS idx_workflow_document ON workflow_states(document_id)',
                'CREATE INDEX IF NOT EXISTS idx_workflow_state ON workflow_states(state)',
                'CREATE INDEX IF NOT EXISTS idx_workflow_assigned ON workflow_states(assigned_to)',
                'CREATE INDEX IF NOT EXISTS idx_workflow_due ON workflow_states(due_date)',
                // approval_steps
                'CREATE INDEX IF NOT EXISTS idx_approval_steps_doc ON approval_steps(document_id)',
                'CREATE INDEX IF NOT EXISTS idx_approval_steps_status ON approval_steps(status)',
                // obligations
                'CREATE INDEX IF NOT EXISTS idx_obligations_document ON obligations(document_id)',
                'CREATE INDEX IF NOT EXISTS idx_obligations_type ON obligations(obligation_type)',
                'CREATE INDEX IF NOT EXISTS idx_obligations_due ON obligations(due_date)',
                'CREATE INDEX IF NOT EXISTS idx_obligations_status ON obligations(status)',
            ];
            for (const ddl of indexDdl) {
                db.exec(ddl);
            }
        })();
        db.exec('PRAGMA foreign_keys = ON');
        console.error('[MIGRATION] v32 migration complete: 10 new tables, provenance + saved_searches columns, 23 indexes');
    }
    catch (error) {
        // Best effort: turn foreign key enforcement back on before reporting the
        // failure. A failure here is logged only, so the original error still surfaces.
        try {
            db.exec('PRAGMA foreign_keys = ON');
        }
        catch (fkErr) {
            console.error('[migrations] Failed to restore foreign_keys pragma:', messageOf(fkErr));
        }
        const cause = messageOf(error);
        throw new MigrationError(`Failed to migrate v31 to v32 (multi-user, collaboration, workflow, CLM, webhooks): ${cause}`, 'migrate', 'users', error);
    }
}
2910
+ //# sourceMappingURL=operations.js.map