ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,656 @@
1
+ /**
2
+ * BM25 Search Service using SQLite FTS5
3
+ *
4
+ * FAIL FAST: All errors throw immediately with detailed messages
5
+ * PROVENANCE: Every result includes provenance_id and content_hash
6
+ */
7
+ import crypto from 'crypto';
8
+ import { SCHEMA_VERSION } from '../storage/migrations/schema-definitions.js';
9
+ import { computeQualityMultiplier } from './quality.js';
/**
 * Scale each result's BM25 score by its OCR-quality multiplier, then
 * re-order the list (highest score first) and renumber `rank` from 1.
 *
 * The array and its elements are mutated in place; nothing is returned.
 *
 * @param results - Result objects carrying `bm25_score`, `ocr_quality_score` and `rank`
 */
function applyQualityAndRerank(results) {
    results.forEach((entry) => {
        entry.bm25_score = entry.bm25_score * computeQualityMultiplier(entry.ocr_quality_score);
    });
    // Descending by adjusted score.
    results.sort((left, right) => right.bm25_score - left.bm25_score);
    results.forEach((entry, position) => {
        entry.rank = position + 1;
    });
}
/**
 * Fail fast unless `query` is a string containing at least one
 * non-whitespace character.
 *
 * @param query - Candidate search query
 * @param message - Error message to throw when the query is unusable
 * @throws Error with `message` for non-strings, empty or whitespace-only input
 */
function assertSearchableQuery(query, message) {
    if (typeof query !== 'string' || query.trim().length === 0) {
        throw new Error(message);
    }
}
/**
 * Build the expression bound to `... MATCH ?` for an FTS5 search.
 *
 * - phraseSearch: the whole query becomes one quoted phrase (embedded `"` doubled).
 * - preSanitized: the query is used verbatim unless it contains quote or
 *   parenthesis characters, in which case it is sanitized anyway (M-7 defense-in-depth).
 * - otherwise: the query goes through sanitizeFTS5Query.
 *
 * @param query - Non-empty query string
 * @param phraseSearch - Treat the query as an exact phrase
 * @param preSanitized - Caller asserts the query is already a safe FTS5 expression
 * @returns FTS5 MATCH expression
 * @throws Error propagated from sanitizeFTS5Query when no valid tokens remain
 */
function buildFtsMatchExpression(query, phraseSearch, preSanitized) {
    if (phraseSearch) {
        return `"${query.replace(/"/g, '""')}"`;
    }
    if (preSanitized) {
        if (!/["'()]/.test(query)) {
            return query;
        }
        console.error(`[WARN] preSanitized query contains FTS5 metacharacters, falling back to sanitization: "${query}"`);
    }
    return sanitizeFTS5Query(query);
}
/**
 * Report whether a table with the given name is listed in sqlite_master.
 *
 * @param db - Database handle (better-sqlite3 style `prepare().get()`)
 * @param tableName - Table name to look up
 * @returns true when the table exists
 */
function ftsTableExists(db, tableName) {
    const row = db
        .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name = ?")
        .get(tableName);
    return Boolean(row);
}
/**
 * BM25 full-text search over chunk text, VLM descriptions, extraction
 * content and document metadata, backed by SQLite FTS5 tables.
 */
export class BM25SearchService {
    db;
    /**
     * @param db - Database handle exposing `prepare`, `exec` and `transaction`
     * @throws Error if the `chunks_fts` table does not exist
     */
    constructor(db) {
        this.db = db;
        this.verifyFTSTableExists();
    }
    /**
     * Ensure the primary FTS table is present.
     *
     * @throws Error if `chunks_fts` is missing
     */
    verifyFTSTableExists() {
        if (!ftsTableExists(this.db, 'chunks_fts')) {
            throw new Error('FTS5 table "chunks_fts" not found. Database must be at schema version 4. ' +
                'Re-select the database to trigger migration.');
        }
    }
    /**
     * Search chunk text via `chunks_fts`.
     *
     * @param options - `query` (required), `limit` (default 10), `phraseSearch`,
     *   `documentFilter` (document ids), `includeHighlight` (default true),
     *   `chunkFilter` (`{ conditions, params }`), `preSanitized`
     * @returns Chunk results, quality-adjusted and ranked from 1
     * @throws Error if the query is empty or sanitizes to nothing
     */
    search(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, chunkFilter, preSanitized = false, } = options;
        assertSearchableQuery(query, 'BM25 search query cannot be empty');
        const ftsQuery = buildFtsMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        c.id AS chunk_id,
        (SELECT e.id FROM embeddings e WHERE e.chunk_id = c.id ORDER BY e.created_at DESC LIMIT 1) AS embedding_id,
        c.document_id,
        c.text AS original_text,
        bm25(chunks_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        c.page_number,
        c.character_start,
        c.character_end,
        c.chunk_index,
        c.provenance_id,
        c.text_hash AS content_hash,
        c.heading_context,
        c.section_path,
        c.content_types,
        c.is_atomic,
        c.page_range,
        c.heading_level,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = c.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score,
        c.overlap_previous,
        c.overlap_next,
        c.chunking_strategy,
        c.embedding_status,
        d.page_count AS doc_page_count,
        (SELECT o.datalab_mode FROM ocr_results o WHERE o.document_id = c.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS datalab_mode,
        (SELECT COUNT(*) FROM chunks c2 WHERE c2.document_id = c.document_id) AS total_chunks
        ${includeHighlight ? ", snippet(chunks_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM chunks_fts
      JOIN chunks c ON chunks_fts.rowid = c.rowid
      JOIN documents d ON c.document_id = d.id
      WHERE chunks_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND c.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        if (chunkFilter && chunkFilter.conditions.length > 0) {
            // NOTE(review): conditions are spliced into the SQL verbatim; presumably they
            // come from an internal filter builder with values in chunkFilter.params —
            // confirm no caller passes user-controlled text here.
            for (const condition of chunkFilter.conditions) {
                sql += ` AND ${condition}`;
            }
            params.push(...chunkFilter.params);
        }
        sql += ` ORDER BY bm25(chunks_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        // TY-09: Field casts below are intentional -- better-sqlite3 returns untyped Records.
        // The SQL query guarantees these columns exist and have the expected types.
        const results = rows.map((row, index) => ({
            chunk_id: row.chunk_id,
            image_id: null,
            embedding_id: row.embedding_id ?? null,
            extraction_id: null,
            document_id: row.document_id,
            original_text: row.original_text,
            // Stored as the absolute value of what bm25() returns.
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'chunk',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: row.page_number,
            character_start: row.character_start,
            character_end: row.character_end,
            chunk_index: row.chunk_index,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            heading_context: row.heading_context ?? null,
            section_path: row.section_path ?? null,
            content_types: row.content_types ?? null,
            is_atomic: !!row.is_atomic,
            page_range: row.page_range ?? null,
            heading_level: row.heading_level ?? null,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
            overlap_previous: row.overlap_previous ?? 0,
            overlap_next: row.overlap_next ?? 0,
            chunking_strategy: row.chunking_strategy ?? null,
            embedding_status: row.embedding_status ?? 'pending',
            doc_page_count: row.doc_page_count ?? null,
            datalab_mode: row.datalab_mode ?? null,
            total_chunks: row.total_chunks ?? 0,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Search VLM description embeddings using FTS5
     * Queries vlm_fts JOIN embeddings JOIN documents
     *
     * NOTE: VLM results only support page_range_filter from chunk filters
     * (VLM embeddings don't have heading_context, section_path, etc.)
     *
     * @param options - `query` (required), `limit` (default 10), `phraseSearch`,
     *   `documentFilter`, `includeHighlight` (default true),
     *   `pageRangeFilter` (`{ min_page?, max_page? }`), `preSanitized`
     * @returns VLM results, or an empty array when `vlm_fts` does not exist
     * @throws Error if the query is empty or sanitizes to nothing
     */
    searchVLM(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, pageRangeFilter, preSanitized = false, } = options;
        assertSearchableQuery(query, 'BM25 search query cannot be empty');
        // Check if vlm_fts table exists (v6+ only)
        if (!ftsTableExists(this.db, 'vlm_fts')) {
            return [];
        }
        const ftsQuery = buildFtsMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        e.id AS embedding_id,
        e.image_id,
        e.document_id,
        e.original_text,
        bm25(vlm_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        e.page_number,
        e.character_start,
        e.character_end,
        e.chunk_index,
        e.provenance_id,
        e.content_hash,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = e.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score
        ${includeHighlight ? ", snippet(vlm_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM vlm_fts
      JOIN embeddings e ON vlm_fts.rowid = e.rowid
      JOIN documents d ON e.document_id = d.id
      WHERE vlm_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND e.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        // VLM only supports page_range_filter (no heading/section/content_type)
        if (pageRangeFilter) {
            if (pageRangeFilter.min_page !== undefined) {
                sql += ' AND e.page_number >= ?';
                params.push(pageRangeFilter.min_page);
            }
            if (pageRangeFilter.max_page !== undefined) {
                sql += ' AND e.page_number <= ?';
                params.push(pageRangeFilter.max_page);
            }
        }
        sql += ` ORDER BY bm25(vlm_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        const results = rows.map((row, index) => ({
            chunk_id: null,
            image_id: row.image_id,
            embedding_id: row.embedding_id,
            extraction_id: null,
            document_id: row.document_id,
            original_text: row.original_text,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'vlm',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: row.page_number,
            character_start: row.character_start,
            character_end: row.character_end,
            chunk_index: row.chunk_index,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Search extraction content using FTS5
     * Queries extractions_fts JOIN extractions JOIN documents
     *
     * NOTE: Extractions don't have page numbers or chunk metadata,
     * so chunkFilter and pageRangeFilter are not applied here.
     *
     * @param options - `query` (required), `limit` (default 10), `phraseSearch`,
     *   `documentFilter`, `includeHighlight` (default true), `preSanitized`
     * @returns Extraction results, or an empty array when `extractions_fts` does not exist
     * @throws Error if the query is empty or sanitizes to nothing
     */
    searchExtractions(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, preSanitized = false, } = options;
        assertSearchableQuery(query, 'BM25 search query cannot be empty');
        // Check if extractions_fts table exists (v9+ only)
        if (!ftsTableExists(this.db, 'extractions_fts')) {
            return [];
        }
        const ftsQuery = buildFtsMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        ex.id AS extraction_id,
        ex.document_id,
        ex.extraction_json AS original_text,
        bm25(extractions_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        ex.provenance_id,
        ex.content_hash,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = ex.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score,
        (SELECT e.id FROM embeddings e WHERE e.extraction_id = ex.id ORDER BY e.created_at DESC LIMIT 1) AS embedding_id
        ${includeHighlight ? ", snippet(extractions_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM extractions_fts
      JOIN extractions ex ON extractions_fts.rowid = ex.rowid
      JOIN documents d ON ex.document_id = d.id
      WHERE extractions_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND ex.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        sql += ` ORDER BY bm25(extractions_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        const results = rows.map((row, index) => ({
            chunk_id: null,
            image_id: null,
            embedding_id: row.embedding_id ?? null,
            extraction_id: row.extraction_id,
            document_id: row.document_id,
            original_text: row.original_text,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'extraction',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: null,
            character_start: 0,
            character_end: 0,
            chunk_index: 0,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Rebuild the chunk FTS index, record the rebuild in `fts_index_metadata`
     * (row id 1), then rebuild the VLM and extraction indexes as well.
     *
     * @returns Counts per index, total duration in ms, and the chunk content hash
     */
    rebuildIndex() {
        const start = Date.now();
        this.db.exec("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')");
        const count = this.db.prepare('SELECT COUNT(*) as cnt FROM chunks').get();
        const contentHash = this.computeContentHash();
        const now = new Date().toISOString();
        this.db
            .prepare(`
        INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
        VALUES (1, ?, ?, 'porter unicode61', ?, ?)
        ON CONFLICT(id) DO UPDATE SET
          last_rebuild_at = excluded.last_rebuild_at,
          chunks_indexed = excluded.chunks_indexed,
          content_hash = excluded.content_hash
      `)
            .run(now, count.cnt, SCHEMA_VERSION, contentHash);
        // Also rebuild VLM FTS if table exists
        const vlmResult = this.rebuildVLMIndex();
        // Also rebuild extractions FTS if table exists
        const extractionResult = this.rebuildExtractionIndex();
        const duration = Date.now() - start;
        return {
            chunks_indexed: count.cnt,
            vlm_indexed: vlmResult.vlm_indexed,
            extractions_indexed: extractionResult.extractions_indexed,
            duration_ms: duration,
            content_hash: contentHash,
        };
    }
    /**
     * Rebuild VLM FTS index from embeddings where image_id IS NOT NULL
     *
     * @returns `{ vlm_indexed, duration_ms }`; zeros when `vlm_fts` does not exist
     */
    rebuildVLMIndex() {
        if (!ftsTableExists(this.db, 'vlm_fts')) {
            return { vlm_indexed: 0, duration_ms: 0 };
        }
        const start = Date.now();
        // L-15: Wrap delete-all + insert + metadata update in a transaction so a crash
        // between delete-all and insert cannot leave an empty VLM FTS index.
        // H-4 fix: FTS5 'rebuild' reads ALL rows from the content table (embeddings),
        // including chunk embeddings (image_id IS NULL). This creates ghost VLM results.
        // Instead: clear the index, then manually re-insert only VLM embeddings.
        const rebuildTransaction = this.db.transaction(() => {
            this.db.exec("INSERT INTO vlm_fts(vlm_fts) VALUES('delete-all')");
            this.db.exec(`
        INSERT INTO vlm_fts(rowid, original_text)
        SELECT rowid, original_text FROM embeddings WHERE image_id IS NOT NULL
      `);
            const count = this.db
                .prepare('SELECT COUNT(*) as cnt FROM embeddings WHERE image_id IS NOT NULL')
                .get();
            const now = new Date().toISOString();
            this.db
                .prepare(`
          INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
          VALUES (2, ?, ?, 'porter unicode61', ?, NULL)
          ON CONFLICT(id) DO UPDATE SET
            last_rebuild_at = excluded.last_rebuild_at,
            chunks_indexed = excluded.chunks_indexed
        `)
                .run(now, count.cnt, SCHEMA_VERSION);
            return count.cnt;
        });
        const vlmCount = rebuildTransaction();
        return {
            vlm_indexed: vlmCount,
            duration_ms: Date.now() - start,
        };
    }
    /**
     * Rebuild extractions FTS index
     *
     * @returns `{ extractions_indexed, duration_ms }`; zeros when `extractions_fts` does not exist
     */
    rebuildExtractionIndex() {
        if (!ftsTableExists(this.db, 'extractions_fts')) {
            return { extractions_indexed: 0, duration_ms: 0 };
        }
        const start = Date.now();
        this.db.exec("INSERT INTO extractions_fts(extractions_fts) VALUES('rebuild')");
        const count = this.db.prepare('SELECT COUNT(*) as cnt FROM extractions').get();
        const now = new Date().toISOString();
        this.db
            .prepare(`
        INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
        VALUES (3, ?, ?, 'porter unicode61', ?, NULL)
        ON CONFLICT(id) DO UPDATE SET
          last_rebuild_at = excluded.last_rebuild_at,
          chunks_indexed = excluded.chunks_indexed
      `)
            .run(now, count.cnt, SCHEMA_VERSION);
        return {
            extractions_indexed: count.cnt,
            duration_ms: Date.now() - start,
        };
    }
    /**
     * Search document metadata (title, author, subject) using FTS5.
     * Queries documents_fts table (v30+).
     *
     * Returns document IDs and metadata fields matching the query.
     * Used to find documents by metadata rather than content.
     *
     * @param options - `query` (required), `limit` (default 10), `phraseSearch`
     * @returns Ranked metadata matches, or an empty array when `documents_fts` does not exist
     * @throws Error if the query is empty or sanitizes to nothing
     */
    searchDocumentMetadata(options) {
        const { query, limit = 10, phraseSearch = false } = options;
        assertSearchableQuery(query, 'Document metadata search query cannot be empty');
        // Check if documents_fts table exists (v30+ only)
        if (!ftsTableExists(this.db, 'documents_fts')) {
            return [];
        }
        const ftsQuery = buildFtsMatchExpression(query, phraseSearch, false);
        const sql = `
      SELECT
        d.id AS document_id,
        d.file_name,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        bm25(documents_fts) AS bm25_score
      FROM documents_fts
      JOIN documents d ON documents_fts.rowid = d.rowid
      WHERE documents_fts MATCH ?
      ORDER BY bm25(documents_fts)
      LIMIT ?
    `;
        const rows = this.db.prepare(sql).all(ftsQuery, limit);
        return rows.map((row, index) => ({
            document_id: row.document_id,
            file_name: row.file_name,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'document_metadata',
        }));
    }
    /**
     * Check whether all expected FTS triggers exist for a given set of trigger names.
     * If all triggers are present, the FTS index is kept in sync atomically and cannot be stale.
     * If any trigger is missing, the index IS stale (triggers are the sync mechanism).
     *
     * @param triggerNames - Trigger names to look up in sqlite_master
     * @returns true when every name is present (vacuously true for an empty list)
     */
    checkTriggersExist(triggerNames) {
        if (triggerNames.length === 0) {
            return true;
        }
        const placeholders = triggerNames.map(() => '?').join(',');
        const row = this.db
            .prepare(`SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='trigger' AND name IN (${placeholders})`)
            .get(...triggerNames);
        return row.cnt === triggerNames.length;
    }
    /**
     * Report index metadata and staleness for the chunk, VLM and extraction indexes.
     *
     * @returns The chunk metadata row (id 1) spread together with live counts,
     *   per-index staleness flags and last-rebuild timestamps
     * @throws Error if the chunk metadata row (id 1) is missing
     */
    getStatus() {
        const meta = this.db.prepare('SELECT * FROM fts_index_metadata WHERE id = 1').get();
        if (!meta) {
            throw new Error('FTS index metadata not found. Database migration to v4 may not have completed.');
        }
        const chunkCount = this.db.prepare('SELECT COUNT(*) as cnt FROM chunks').get().cnt;
        // L-7 fix: Stale detection via trigger existence, not count comparison.
        // FTS is maintained by triggers that fire atomically on INSERT/DELETE/UPDATE.
        // If all triggers exist, the index is in sync by definition.
        // If any trigger is missing, the index IS stale (sync mechanism is broken).
        const chunksTriggersOk = this.checkTriggersExist([
            'chunks_fts_ai', 'chunks_fts_ad', 'chunks_fts_au',
        ]);
        // Get VLM FTS metadata (id=2) if it exists
        const vlmMeta = this.db.prepare('SELECT * FROM fts_index_metadata WHERE id = 2').get();
        const vlmCount = this.db
            .prepare('SELECT COUNT(*) as cnt FROM embeddings WHERE image_id IS NOT NULL')
            .get().cnt;
        const vlmIndexed = vlmMeta?.chunks_indexed ?? 0;
        const vlmTriggersOk = this.checkTriggersExist([
            'vlm_fts_ai', 'vlm_fts_ad', 'vlm_fts_au',
        ]);
        // Get extraction FTS metadata (id=3) if it exists
        const extractionMeta = this.db
            .prepare('SELECT * FROM fts_index_metadata WHERE id = 3')
            .get();
        // Best effort: a failing count is logged and reported as 0 rather than aborting.
        let extractionCount;
        try {
            extractionCount = this.db.prepare('SELECT COUNT(*) as cnt FROM extractions').get().cnt;
        }
        catch (error) {
            console.error(`[BM25] Failed to count extractions: ${String(error)}`);
            extractionCount = 0;
        }
        const extractionsIndexed = extractionMeta?.chunks_indexed ?? 0;
        const extractionTriggersOk = this.checkTriggersExist([
            'extractions_fts_ai', 'extractions_fts_ad', 'extractions_fts_au',
        ]);
        return {
            ...meta,
            current_chunk_count: chunkCount,
            index_stale: !chunksTriggersOk,
            vlm_indexed: vlmIndexed,
            current_vlm_count: vlmCount,
            vlm_index_stale: !vlmTriggersOk,
            vlm_last_rebuild_at: vlmMeta?.last_rebuild_at ?? null,
            extractions_indexed: extractionsIndexed,
            current_extraction_count: extractionCount,
            extraction_index_stale: !extractionTriggersOk,
            extraction_last_rebuild_at: extractionMeta?.last_rebuild_at ?? null,
        };
    }
    /**
     * @returns Content hash of the chunks table, as produced by computeFTSContentHash
     */
    computeContentHash() {
        return computeFTSContentHash(this.db);
    }
}
/**
 * Sanitize a user-provided query for safe use in FTS5 MATCH expressions.
 *
 * - Preserves FTS5 boolean operators (AND, OR, NOT), matched case-insensitively
 *   and emitted in upper case
 * - Treats hyphens as word separators (matching unicode61 tokenizer)
 * - Strips every ASCII character that is not valid in an FTS5 bareword, i.e.
 *   anything other than letters, digits and underscore. This covers
 *   ' " ( ) * : ^ ~ + and also `=` and control characters, which a blacklist
 *   would let through and which FTS5 rejects as a syntax error.
 *   Non-ASCII characters are kept.
 * - Inserts implicit AND between consecutive non-operator tokens
 * - Strips leading/trailing operators; of consecutive operators only the first is kept
 *
 * This is the SINGLE authoritative FTS5 sanitizer for the entire codebase.
 *
 * @param query - Raw user query string
 * @returns Sanitized FTS5 query string
 * @throws Error if query contains no valid tokens after sanitization
 */
export function sanitizeFTS5Query(query) {
    const FTS5_OPERATORS = new Set(['AND', 'OR', 'NOT']);
    // Whitelist instead of blacklist: anything ASCII outside [A-Za-z0-9_] cannot
    // appear in an unquoted FTS5 term. Code units >= U+0080 are left untouched.
    const NON_BAREWORD_ASCII = /[^A-Za-z0-9_\u0080-\uFFFF]/g;
    const tokens = [];
    for (const raw of query.trim().split(/\s+/)) {
        if (raw.length === 0) {
            continue;
        }
        const upper = raw.toUpperCase();
        if (FTS5_OPERATORS.has(upper)) {
            tokens.push(upper);
            continue;
        }
        // L-5: Treat hyphens as word separators (matching FTS5 unicode61 tokenizer)
        for (const piece of raw.split('-')) {
            const term = piece.replace(NON_BAREWORD_ASCII, '');
            if (term.length > 0) {
                tokens.push(term);
            }
        }
    }
    // Trim operators from both ends; this also guarantees the query can never
    // start with NOT, so no separate leading-NOT handling is needed.
    let start = 0;
    let end = tokens.length;
    while (start < end && FTS5_OPERATORS.has(tokens[start])) {
        start++;
    }
    while (end > start && FTS5_OPERATORS.has(tokens[end - 1])) {
        end--;
    }
    if (start === end) {
        throw new Error('Query contains no valid search tokens after sanitization');
    }
    const parts = [];
    for (let i = start; i < end; i++) {
        const token = tokens[i];
        if (parts.length > 0) {
            const tokenIsOperator = FTS5_OPERATORS.has(token);
            const previousIsOperator = FTS5_OPERATORS.has(parts[parts.length - 1]);
            if (tokenIsOperator && previousIsOperator) {
                // Collapse runs of operators, keeping the first one.
                continue;
            }
            if (!tokenIsOperator && !previousIsOperator) {
                // Insert implicit AND between consecutive non-operator tokens
                parts.push('AND');
            }
        }
        parts.push(token);
    }
    return parts.join(' ');
}
/**
 * Compute the SHA-256 content hash over every chunk's id and text_hash, used to
 * verify FTS index integrity.
 *
 * Rows are read in id order and fed to the hash as `id:text_hash` entries joined
 * by `|`. L-10 fix: rows are streamed with iterate() and hashed incrementally so
 * the full result set is never held in memory.
 * Used by both BM25SearchService and the v3->v4 migration.
 *
 * @param db - Database handle exposing `prepare(sql).iterate()`
 * @returns Hex digest prefixed with `sha256:`
 */
export function computeFTSContentHash(db) {
    const digest = crypto.createHash('sha256');
    const rows = db.prepare('SELECT id, text_hash FROM chunks ORDER BY id').iterate();
    // Empty before the first row, '|' afterwards.
    let separator = '';
    for (const { id, text_hash: textHash } of rows) {
        digest.update(`${separator}${id}:${textHash}`);
        separator = '|';
    }
    return `sha256:${digest.digest('hex')}`;
}
656
+ //# sourceMappingURL=bm25.js.map