ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,2528 @@
1
+ /**
2
+ * Search MCP Tools
3
+ *
4
+ * Tools: ocr_search (unified: keyword/semantic/hybrid), ocr_fts_manage,
5
+ * ocr_search_export, ocr_benchmark_compare, ocr_rag_context,
6
+ * ocr_search_saved (unified: save/list/get/execute)
7
+ *
8
+ * CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
9
+ * Use console.error() for all logging.
10
+ *
11
+ * @module tools/search
12
+ */
13
+ import * as fs from 'fs';
14
+ import * as path from 'path';
15
+ import { safeMin, safeMax } from '../utils/math.js';
16
+ import { v4 as uuidv4 } from 'uuid';
17
+ import { z } from 'zod';
18
+ import { getEmbeddingService } from '../services/embedding/embedder.js';
19
+ import { DatabaseService } from '../services/storage/database/index.js';
20
+ import { VectorService } from '../services/storage/vector.js';
21
+ import { requireDatabase, getDefaultStoragePath, withDatabaseOperation } from '../server/state.js';
22
+ import { successResult } from '../server/types.js';
23
+ import { validateInput, sanitizePath, escapeLikePattern, SearchUnifiedInput, FTSManageInput, } from '../utils/validation.js';
24
+ import { MCPError } from '../server/errors.js';
25
+ import { formatResponse, handleError } from './shared.js';
26
+ import { BM25SearchService, sanitizeFTS5Query } from '../services/search/bm25.js';
27
+ import { RRFFusion } from '../services/search/fusion.js';
28
+ import { rerankResults } from '../services/search/reranker.js';
29
+ import { expandQuery, getExpandedTerms } from '../services/search/query-expander.js';
30
+ import { classifyQuery, isTableQuery } from '../services/search/query-classifier.js';
31
+ import { getClusterSummariesForDocument } from '../services/storage/database/cluster-operations.js';
32
+ import { getImage } from '../services/storage/database/image-operations.js';
33
+ import { computeBlockConfidence, isRepeatedHeaderFooter } from '../services/chunking/json-block-analyzer.js';
34
/**
 * Bucket a flat list of search results by the document they came from.
 *
 * The document ID is read from `document_id`, falling back to
 * `source_document_id`; results carrying neither are dropped. The
 * document-level metadata on each bucket is copied from the first result
 * encountered for that document.
 *
 * @param {Iterable<object>} results - Flat search results.
 * @returns {{grouped: object[], total_documents: number}} Buckets ordered by
 *   `result_count` descending, plus the number of distinct documents.
 */
function groupResultsByDocument(results) {
    const buckets = new Map();
    for (const hit of results) {
        const documentId = hit.document_id ?? hit.source_document_id;
        if (!documentId) {
            continue;
        }
        let bucket = buckets.get(documentId);
        if (bucket === undefined) {
            bucket = {
                document_id: documentId,
                file_name: hit.source_file_name ?? '',
                file_path: hit.source_file_path ?? '',
                doc_title: hit.doc_title ?? null,
                doc_author: hit.doc_author ?? null,
                total_pages: hit.doc_page_count ?? null,
                total_chunks: hit.total_chunks ?? 0,
                ocr_quality_score: hit.ocr_quality_score ?? null,
                result_count: 0,
                results: [],
            };
            buckets.set(documentId, bucket);
        }
        bucket.result_count += 1;
        bucket.results.push(hit);
    }
    const grouped = [...buckets.values()];
    // Busiest documents first; ties keep first-seen order (stable sort).
    grouped.sort((left, right) => right.result_count - left.result_count);
    return { grouped, total_documents: buckets.size };
}
68
+ // ═══════════════════════════════════════════════════════════════════════════════
69
+ // METADATA FILTER RESOLVER
70
+ // ═══════════════════════════════════════════════════════════════════════════════
71
/**
 * Translate a metadata filter (title / author / subject substrings) into a
 * list of matching document IDs.
 *
 * - No filter, or a filter with none of the three fields set: returns
 *   `existingDocFilter` untouched.
 * - Otherwise runs a LIKE query (each needle escaped with `escapeLikePattern`)
 *   and, when `existingDocFilter` is non-empty, restricts it to those IDs.
 * - When nothing matches, returns the `['__no_match__']` sentinel so that
 *   later filter stages see an explicit empty set rather than "no filter".
 *
 * @param {object} db - Database service exposing `getConnection()`.
 * @param {{doc_title?: string, doc_author?: string, doc_subject?: string}|undefined} metadataFilter
 * @param {string[]|undefined} existingDocFilter
 * @returns {string[]|undefined}
 */
function resolveMetadataFilter(db, metadataFilter, existingDocFilter) {
    if (!metadataFilter) {
        return existingDocFilter;
    }
    const { doc_title, doc_author, doc_subject } = metadataFilter;
    // Column names are fixed literals here; only the needles come from input.
    const requested = [
        ['doc_title', doc_title],
        ['doc_author', doc_author],
        ['doc_subject', doc_subject],
    ].filter(([, needle]) => Boolean(needle));
    if (requested.length === 0) {
        return existingDocFilter;
    }
    const clauses = ['SELECT id FROM documents WHERE 1=1'];
    const bindings = [];
    for (const [column, needle] of requested) {
        clauses.push(`AND ${column} LIKE ? ESCAPE '\\'`);
        bindings.push(`%${escapeLikePattern(needle)}%`);
    }
    // Intersect with an already-active document filter, if any.
    if (existingDocFilter && existingDocFilter.length > 0) {
        const marks = existingDocFilter.map(() => '?').join(',');
        clauses.push(`AND id IN (${marks})`);
        bindings.push(...existingDocFilter);
    }
    const matches = db
        .getConnection()
        .prepare(clauses.join(' '))
        .all(...bindings)
        .map((row) => row.id);
    // Sentinel: "filter given, nothing matched" must not look like "no filter".
    return matches.length === 0 ? ['__no_match__'] : matches;
}
114
/**
 * Restrict the document filter to documents whose OCR parse quality meets a
 * minimum score.
 *
 * - `minQualityScore` unset (`undefined`/`null`) or `0`: returns
 *   `existingDocFilter` untouched and runs no query.
 * - Otherwise selects documents having an `ocr_results.parse_quality_score`
 *   that is non-NULL and `>= minQualityScore`.
 * - An absent OR EMPTY `existingDocFilter` means "no prior filter", matching
 *   how `resolveMetadataFilter` and `resolveClusterFilter` interpret it; the
 *   qualifying IDs are returned as-is. A non-empty filter is intersected.
 * - If the outcome is empty, the `['__no_match__']` sentinel is returned so
 *   downstream search applies an unsatisfiable filter instead of none.
 *
 * @param {object} db - Database service exposing `getConnection()`.
 * @param {number|undefined|null} minQualityScore
 * @param {string[]|undefined} existingDocFilter
 * @returns {string[]|undefined}
 */
function resolveQualityFilter(db, minQualityScore, existingDocFilter) {
    if (minQualityScore == null || minQualityScore === 0) {
        return existingDocFilter;
    }
    const rows = db
        .getConnection()
        .prepare(`SELECT DISTINCT d.id FROM documents d
            JOIN ocr_results o ON o.document_id = d.id
            WHERE o.parse_quality_score IS NOT NULL AND o.parse_quality_score >= ?`)
        .all(minQualityScore);
    const qualityIds = new Set(rows.map((row) => row.id));
    // An empty array carries no constraint; previously it silently blocked
    // every result when combined with a quality threshold.
    const hasPriorFilter = Array.isArray(existingDocFilter) && existingDocFilter.length > 0;
    const surviving = hasPriorFilter
        ? existingDocFilter.filter((id) => qualityIds.has(id))
        : [...qualityIds];
    return surviving.length === 0 ? ['__no_match__'] : surviving;
}
141
/**
 * Fetch the provenance chain for a record and reduce every link to a
 * five-field summary (`id`, `type`, `chain_depth`, `processor`,
 * `content_hash`).
 *
 * @param {object} db - Database service exposing `getProvenanceChain(id)`.
 * @param {string} provenanceId
 * @returns {object[]} One summary per chain link, in chain order.
 */
function formatProvenanceChain(db, provenanceId) {
    const links = db.getProvenanceChain(provenanceId);
    const summarize = ({ id, type, chain_depth, processor, content_hash }) => ({
        id,
        type,
        chain_depth,
        processor,
        content_hash,
    });
    return links.map(summarize);
}
154
/**
 * Narrow the document filter to members of one cluster.
 *
 * Looks up `document_clusters` for `clusterId`. With no cluster given the
 * existing filter passes through. A cluster with no members, or an empty
 * intersection with a non-empty existing filter, yields the
 * `['__no_match__']` sentinel.
 *
 * @param {object} conn - SQLite connection exposing `prepare(sql).all(...)`.
 * @param {string|undefined} clusterId
 * @param {string[]|undefined} existingDocFilter
 * @returns {string[]|undefined}
 */
function resolveClusterFilter(conn, clusterId, existingDocFilter) {
    if (!clusterId) {
        return existingDocFilter;
    }
    const memberIds = conn
        .prepare('SELECT document_id FROM document_clusters WHERE cluster_id = ?')
        .all(clusterId)
        .map((row) => row.document_id);
    if (memberIds.length === 0) {
        return ['__no_match__'];
    }
    const hasPriorFilter = Boolean(existingDocFilter) && existingDocFilter.length > 0;
    if (!hasPriorFilter) {
        return memberIds;
    }
    const members = new Set(memberIds);
    const overlap = existingDocFilter.filter((id) => members.has(id));
    return overlap.length > 0 ? overlap : ['__no_match__'];
}
175
/**
 * Build SQL WHERE fragments (and their bound parameters) for chunk-level
 * filters.
 *
 * Every fragment references the chunks table through the alias `c`; callers
 * that use another alias must translate it themselves. Fragments are meant to
 * be AND-ed together, and `params` lines up positionally with the `?`
 * placeholders in `conditions`.
 *
 * All user-supplied substrings that end up inside a LIKE pattern are passed
 * through `escapeLikePattern` and matched with `ESCAPE '\'`, so `%`, `_` and
 * `\` in the input are treated literally rather than as wildcards.
 *
 * @param {object} filters - Optional keys: `content_type_filter` (string[]),
 *   `section_path_filter`, `heading_filter`, `page_range_filter`
 *   ({min_page, max_page}), `is_atomic_filter` (boolean),
 *   `heading_level_filter` ({min_level, max_level}), `min_page_count`,
 *   `max_page_count`, `table_columns_contain`.
 * @returns {{conditions: string[], params: Array<string|number>}}
 */
function resolveChunkFilter(filters) {
    const conditions = [];
    const params = [];
    // Append an inclusive lower/upper bound pair for a numeric column.
    const addRange = (expr, min, max) => {
        if (min !== undefined) {
            conditions.push(`${expr} >= ?`);
            params.push(min);
        }
        if (max !== undefined) {
            conditions.push(`${expr} <= ?`);
            params.push(max);
        }
    };
    if (filters.content_type_filter && filters.content_type_filter.length > 0) {
        // content_types is stored as JSON text such as '["table","text"]';
        // match when ANY requested type appears as a quoted token.
        const anyType = filters.content_type_filter
            .map(() => "c.content_types LIKE '%' || ? || '%' ESCAPE '\\'")
            .join(' OR ');
        conditions.push(`(${anyType})`);
        params.push(...filters.content_type_filter.map((t) => `"${escapeLikePattern(t)}"`));
    }
    if (filters.section_path_filter) {
        // Prefix match on the section path.
        conditions.push("c.section_path LIKE ? || '%' ESCAPE '\\'");
        params.push(escapeLikePattern(filters.section_path_filter));
    }
    if (filters.heading_filter) {
        conditions.push("c.heading_context LIKE '%' || ? || '%' ESCAPE '\\'");
        params.push(escapeLikePattern(filters.heading_filter));
    }
    if (filters.page_range_filter) {
        addRange('c.page_number', filters.page_range_filter.min_page, filters.page_range_filter.max_page);
    }
    if (filters.is_atomic_filter !== undefined) {
        conditions.push('c.is_atomic = ?');
        params.push(filters.is_atomic_filter ? 1 : 0);
    }
    if (filters.heading_level_filter) {
        addRange('c.heading_level', filters.heading_level_filter.min_level, filters.heading_level_filter.max_level);
    }
    addRange('(SELECT page_count FROM documents WHERE id = c.document_id)', filters.min_page_count, filters.max_page_count);
    if (filters.table_columns_contain) {
        // Only atomic chunks whose provenance processing_params mention the
        // requested column text (case-insensitive substring).
        conditions.push('c.is_atomic = 1');
        conditions.push(`EXISTS (SELECT 1 FROM provenance p WHERE p.id = c.provenance_id AND LOWER(p.processing_params) LIKE '%' || LOWER(?) || '%' ESCAPE '\\')`);
        params.push(escapeLikePattern(filters.table_columns_contain));
    }
    return { conditions, params };
}
238
/**
 * Attach neighbouring chunks as `context_before` / `context_after` on each
 * search result (mutates the result objects).
 *
 * For every result that has both a `document_id` and a numeric `chunk_index`,
 * up to `contextSize` chunks on each side are attached, each trimmed to 500
 * characters and flagged `is_context: true`. Neighbours that are themselves
 * primary results are skipped. Results lacking a document or a chunk index
 * (`undefined` OR `null`, e.g. rows that are not chunk-backed) get empty
 * context arrays instead of being treated as chunk 0.
 *
 * One query is issued per document, covering the full index span needed by
 * that document's results.
 *
 * @param {object} conn - SQLite connection exposing `prepare(sql).all(...)`.
 * @param {object[]} results - Search results to decorate in place.
 * @param {number} contextSize - Neighbours per side; `<= 0` is a no-op.
 * @returns {void}
 */
function attachContextChunks(conn, results, contextSize) {
    if (contextSize <= 0 || results.length === 0) {
        return;
    }
    const primaryChunkIds = new Set(results.map((r) => r.chunk_id).filter(Boolean));
    // Group by document so each document costs a single query.
    const byDoc = new Map();
    for (const r of results) {
        // `== null` also catches SQL NULLs, which would otherwise coerce to 0
        // in the index arithmetic below and pull in unrelated chunks.
        if (!r.document_id || r.chunk_index == null) {
            r.context_before = [];
            r.context_after = [];
            continue;
        }
        if (!byDoc.has(r.document_id)) {
            byDoc.set(r.document_id, []);
        }
        byDoc.get(r.document_id).push(r);
    }
    // Collect context entries for chunk indexes in [from, to], in order.
    const collect = (neighborMap, from, to) => {
        const picked = [];
        for (let i = from; i <= to; i++) {
            const n = neighborMap.get(i);
            if (!n || primaryChunkIds.has(n.id)) {
                continue;
            }
            picked.push({
                chunk_id: n.id,
                chunk_index: n.chunk_index,
                text: (n.text ?? '').substring(0, 500),
                page_number: n.page_number,
                heading_context: n.heading_context,
                is_context: true,
            });
        }
        return picked;
    };
    for (const [docId, docResults] of byDoc) {
        const indices = docResults.map((r) => r.chunk_index);
        const minIdx = (safeMin(indices) ?? 0) - contextSize;
        const maxIdx = (safeMax(indices) ?? 0) + contextSize;
        const neighbors = conn.prepare(`SELECT id, text, chunk_index, page_number, heading_context, section_path, content_types
            FROM chunks
            WHERE document_id = ? AND chunk_index BETWEEN ? AND ?
            ORDER BY chunk_index`).all(docId, minIdx, maxIdx);
        const neighborMap = new Map(neighbors.map((n) => [n.chunk_index, n]));
        for (const r of docResults) {
            const idx = r.chunk_index;
            r.context_before = collect(neighborMap, idx - contextSize, idx - 1);
            r.context_after = collect(neighborMap, idx + 1, idx + contextSize);
        }
    }
}
307
/**
 * Decorate table-bearing results with column metadata (mutates results).
 *
 * A result qualifies when it has a `chunk_id` and its `content_types` string
 * contains the token `"table"`. For those chunks the linked provenance row's
 * `processing_params` JSON is read in one batched query; when it carries
 * `table_columns`, the result gains `table_columns`, `table_row_count` and
 * `table_column_count` (counts default to 0). Unparseable params are logged
 * to stderr and skipped.
 *
 * @param {object} conn - SQLite connection exposing `prepare(sql).all(...)`.
 * @param {object[]} results - Search results to decorate in place.
 * @returns {void}
 */
function attachTableMetadata(conn, results) {
    const tableChunkIds = results
        .filter((hit) => hit.chunk_id && typeof hit.content_types === 'string' && hit.content_types.includes('"table"'))
        .map((hit) => hit.chunk_id);
    if (tableChunkIds.length === 0) {
        return;
    }
    // chunks.provenance_id -> provenance.id, one query for all table chunks.
    const marks = tableChunkIds.map(() => '?').join(',');
    const provenanceRows = conn.prepare(`SELECT c.id AS chunk_id, p.processing_params
        FROM chunks c
        INNER JOIN provenance p ON c.provenance_id = p.id
        WHERE c.id IN (${marks})`).all(...tableChunkIds);
    const metadataByChunk = new Map();
    for (const { chunk_id: chunkId, processing_params: rawParams } of provenanceRows) {
        if (metadataByChunk.has(chunkId)) {
            continue;
        }
        try {
            const parsed = JSON.parse(rawParams);
            if (parsed.table_columns) {
                metadataByChunk.set(chunkId, {
                    table_columns: parsed.table_columns,
                    table_row_count: parsed.table_row_count ?? 0,
                    table_column_count: parsed.table_column_count ?? 0,
                });
            }
        }
        catch (error) {
            console.error(`[search] Failed to parse processing_params for chunk ${chunkId}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    // Surface the metadata as top-level fields on the matching results.
    for (const hit of results) {
        const metadata = hit.chunk_id ? metadataByChunk.get(hit.chunk_id) : undefined;
        if (!metadata) {
            continue;
        }
        hit.table_columns = metadata.table_columns;
        hit.table_row_count = metadata.table_row_count;
        hit.table_column_count = metadata.table_column_count;
    }
}
358
/**
 * Drop results whose chunk carries the `system:repeated_header_footer` tag
 * (T2.8).
 *
 * Tagged chunk IDs are read from `entity_tags` joined to `tags`. When no chunk
 * is tagged the input array itself is returned; otherwise a new filtered array
 * is returned. Results without a `chunk_id` are always kept.
 *
 * @param {object} conn - SQLite connection exposing `prepare(sql).all()`.
 * @param {object[]} results
 * @returns {object[]}
 */
function excludeRepeatedHeaderFooterChunks(conn, results) {
    const tagged = conn.prepare(`SELECT et.entity_id FROM entity_tags et
        JOIN tags t ON t.id = et.tag_id
        WHERE t.name = 'system:repeated_header_footer' AND et.entity_type = 'chunk'`).all();
    if (tagged.length === 0) {
        return results;
    }
    const blocked = new Set();
    for (const row of tagged) {
        blocked.add(row.entity_id);
    }
    return results.filter(({ chunk_id: chunkId }) => !(chunkId && blocked.has(chunkId)));
}
376
+ // ═══════════════════════════════════════════════════════════════════════════════
377
+ // V7 INTELLIGENCE OPTIMIZATION - COMPACT MODE & PROVENANCE SUMMARY
378
+ // ═══════════════════════════════════════════════════════════════════════════════
379
/**
 * Project a full search result down to its compact form.
 *
 * The `score` is taken from the field that matches the search mode
 * (`keyword` -> `bm25_score`, `hybrid` -> `rrf_score`, anything else ->
 * `similarity_score`); if that field is null/undefined it falls back to
 * `similarity_score`, then `bm25_score`, then `rrf_score`.
 *
 * @param {object} r - Full search result.
 * @param {string} mode - Search mode that produced the result.
 * @returns {object} Object with `document_id`, `chunk_id`, `original_text`,
 *   `source_file_name`, `page_number`, `score` and `result_type`.
 */
function compactResult(r, mode) {
    const preferredField = mode === 'keyword'
        ? 'bm25_score'
        : mode === 'hybrid'
            ? 'rrf_score'
            : 'similarity_score';
    const score = r[preferredField] ?? r.similarity_score ?? r.bm25_score ?? r.rrf_score;
    const { document_id, chunk_id, original_text, source_file_name, page_number, result_type } = r;
    return {
        document_id,
        chunk_id,
        original_text,
        source_file_name,
        page_number,
        score,
        result_type,
    };
}
406
/**
 * Render a provenance chain as a one-line summary joined by arrows, e.g.
 * "FILE → OCR (marker, quality 4.2/5.0) → Chunk 3 → Embedding".
 *
 * Per link type:
 * - `DOCUMENT`: upper-cased `source_type`, or "DOCUMENT" when absent.
 * - `OCR_RESULT`: "OCR (<processor>[, quality <score>/5.0])"; the quality
 *   part is emitted only for a finite numeric `processing_quality_score`.
 * - `CHUNK`: "Chunk <n>" with a 1-based index when `location.chunk_index` is
 *   an integer, otherwise just "Chunk".
 * - `EMBEDDING` -> "Embedding", `VLM_DESCRIPTION` -> "VLM".
 * - Any other type is emitted verbatim.
 *
 * @param {object} db - Database service exposing `getProvenanceChain(id)`.
 * @param {string|undefined|null} provenanceId
 * @returns {string|undefined} The summary, or `undefined` when there is no ID,
 *   the chain is empty, or the lookup throws (the error is logged to stderr).
 */
function buildProvenanceSummary(db, provenanceId) {
    if (!provenanceId) {
        return undefined;
    }
    try {
        const chain = db.getProvenanceChain(provenanceId);
        if (!chain || chain.length === 0) {
            return undefined;
        }
        const parts = chain.map((link) => {
            switch (link.type) {
                case 'DOCUMENT':
                    return link.source_type?.toUpperCase() ?? 'DOCUMENT';
                case 'OCR_RESULT': {
                    const quality = link.processing_quality_score;
                    // Guard the type: a non-number would make toFixed throw
                    // and discard the entire summary.
                    const qualityStr = typeof quality === 'number' && Number.isFinite(quality)
                        ? `, quality ${quality.toFixed(1)}/5.0`
                        : '';
                    return `OCR (${link.processor ?? 'unknown'}${qualityStr})`;
                }
                case 'CHUNK': {
                    const chunkIndex = link.location?.chunk_index;
                    // A null index must not be rendered as "Chunk 1".
                    return Number.isInteger(chunkIndex) ? `Chunk ${chunkIndex + 1}` : 'Chunk';
                }
                case 'EMBEDDING':
                    return 'Embedding';
                case 'VLM_DESCRIPTION':
                    return 'VLM';
                default:
                    return link.type;
            }
        });
        return parts.join(' \u2192 ');
    }
    catch (err) {
        console.error(`[search] Failed to build provenance summary for ${provenanceId}: ${err instanceof Error ? err.message : String(err)}`);
        return undefined;
    }
}
457
/**
 * Apply the V7 response options to `responseData.results` (in place).
 *
 * 1. `input.include_provenance_summary`: every result gets a
 *    `provenance_summary` from `buildProvenanceSummary`. This runs first
 *    because compaction removes `provenance_id`.
 * 2. `input.compact`: results are replaced by their `compactResult`
 *    projection (keeping a truthy `provenance_summary`), and
 *    `responseData.compact` is set to `true`.
 *
 * Must run before results are grouped.
 *
 * @param {{results: object[], compact?: boolean}} responseData
 * @param {{include_provenance_summary?: boolean, compact?: boolean}} input
 * @param {object} db - Passed through to `buildProvenanceSummary`.
 * @param {string} mode - Search mode, passed through to `compactResult`.
 * @returns {void}
 */
function applyV7Transforms(responseData, input, db, mode) {
    if (input.include_provenance_summary) {
        for (const hit of responseData.results) {
            hit.provenance_summary = buildProvenanceSummary(db, hit.provenance_id);
        }
    }
    if (!input.compact) {
        return;
    }
    const slimmed = [];
    for (const hit of responseData.results) {
        const slim = compactResult(hit, mode);
        if (hit.provenance_summary) {
            slim.provenance_summary = hit.provenance_summary;
        }
        slimmed.push(slim);
    }
    responseData.results = slimmed;
    responseData.compact = true;
}
480
/**
 * Add a `cluster_context` array to every result that has a `document_id`
 * (mutates results).
 *
 * Cluster summaries are fetched once per distinct document via
 * `getClusterSummariesForDocument` and reduced to
 * `{cluster_id, cluster_label, run_id}`. If the lookup for a document throws,
 * the error is logged to stderr and that document gets an empty array.
 *
 * @param {object} conn - Connection handed to `getClusterSummariesForDocument`.
 * @param {object[]} results - Search results to decorate in place.
 * @returns {void}
 */
function attachClusterContext(conn, results) {
    const distinctDocIds = new Set();
    for (const hit of results) {
        if (hit.document_id) {
            distinctDocIds.add(hit.document_id);
        }
    }
    if (distinctDocIds.size === 0) {
        return;
    }
    const contextByDoc = new Map();
    for (const docId of distinctDocIds) {
        let context;
        try {
            context = getClusterSummariesForDocument(conn, docId).map(({ id, label, run_id }) => ({
                cluster_id: id,
                cluster_label: label,
                run_id,
            }));
        }
        catch (error) {
            console.error(`[Search] Failed to get cluster summaries for document ${docId}: ${String(error)}`);
            context = [];
        }
        contextByDoc.set(docId, context);
    }
    for (const hit of results) {
        if (!hit.document_id) {
            continue;
        }
        hit.cluster_context = contextByDoc.get(hit.document_id) ?? [];
    }
}
511
/**
 * Attach corpus-level context to the FIRST result of each document (mutates
 * results), leaving that document's other results untouched to limit noise.
 *
 * For every distinct document (`document_id`, else `source_document_id`) two
 * lookups are made: up to 3 cluster memberships, and up to 3 comparisons
 * involving the document ordered by `similarity_ratio` descending. The
 * attached `document_context` is `{clusters, related_documents}`, each `null`
 * when its lookup returned no rows. A failing lookup is logged to stderr and
 * the document simply gets no context.
 *
 * @param {object} conn - SQLite connection exposing `prepare(sql).all(...)`.
 * @param {object[]} results - Search results to decorate in place.
 * @returns {void}
 */
function attachCrossDocumentContext(conn, results) {
    const docIdOf = (hit) => hit.document_id ?? hit.source_document_id;
    const distinctDocIds = new Set();
    for (const hit of results) {
        const docId = docIdOf(hit);
        if (docId) {
            distinctDocIds.add(docId);
        }
    }
    if (distinctDocIds.size === 0) {
        return;
    }
    const orNull = (rows) => (rows.length > 0 ? rows : null);
    const contextByDoc = new Map();
    for (const docId of distinctDocIds) {
        try {
            const memberships = conn.prepare(`SELECT c.id, c.label, c.classification_tag, dc.similarity_to_centroid
                FROM document_clusters dc JOIN clusters c ON c.id = dc.cluster_id
                WHERE dc.document_id = ? LIMIT 3`).all(docId);
            // The "other" document of each comparison row, most similar first.
            const related = conn.prepare(`SELECT
                CASE WHEN document_id_1 = ? THEN document_id_2 ELSE document_id_1 END as related_doc_id,
                similarity_ratio, summary
                FROM comparisons
                WHERE document_id_1 = ? OR document_id_2 = ?
                ORDER BY similarity_ratio DESC LIMIT 3`).all(docId, docId, docId);
            contextByDoc.set(docId, {
                clusters: orNull(memberships),
                related_documents: orNull(related),
            });
        }
        catch (error) {
            console.error(`[Search] Failed to get cross-document context for ${docId}: ${String(error)}`);
        }
    }
    const alreadyDecorated = new Set();
    for (const hit of results) {
        const docId = docIdOf(hit);
        if (!docId || alreadyDecorated.has(docId)) {
            continue;
        }
        alreadyDecorated.add(docId);
        const context = contextByDoc.get(docId);
        if (context) {
            hit.document_context = context;
        }
    }
}
556
/**
 * Copy image metadata onto results that reference an image (mutates results).
 *
 * For each result with a truthy `image_id`, the image record is loaded with
 * `getImage`; when found, the result gains `image_extracted_path`,
 * `image_page_number`, `image_dimensions` ({width, height}),
 * `image_block_type` and `image_format`. Results without an `image_id`, or
 * whose image cannot be found, are left as they are.
 *
 * @param {object} conn - Connection handed to `getImage`.
 * @param {object[]} results - Search results to decorate in place.
 * @returns {void}
 */
function enrichVLMResultsWithImageMetadata(conn, results) {
    for (const hit of results) {
        if (!hit.image_id) {
            continue;
        }
        const record = getImage(conn, hit.image_id);
        if (!record) {
            continue;
        }
        hit.image_extracted_path = record.extracted_path;
        hit.image_page_number = record.page_number;
        const { width, height } = record.dimensions;
        hit.image_dimensions = { width, height };
        hit.image_block_type = record.block_type;
        hit.image_format = record.format;
    }
}
575
/**
 * Re-weight result scores after retrieval using chunk metadata (mutates the
 * `bm25_score`, `similarity_score` and `rrf_score` fields that are present).
 *
 * Multipliers, applied in this order and then clamped to [0.5, 2.0]:
 * - Heading level (unless `options.headingBoost === false`): level 1 -> 1.3x,
 *   2 -> 1.2x, 3 -> 1.1x, 4 or deeper / missing -> 1.0x.
 * - Atomic chunk (unless `options.atomicBoost === false`): 1.1x.
 * - Content-type preference when `options.contentTypeQuery` is set: query
 *   keywords about tables / code / lists boost chunks whose `content_types`
 *   contains `"table"` (1.2x), `"code"` (1.2x) or `"list"` (1.15x).
 * - Block confidence: `0.8 + 0.4 * computeBlockConfidence(types)` for a
 *   non-empty parsed `content_types` array; parse failures are logged.
 * - Header/footer penalties: 0.5x when `isRepeatedHeaderFooter` matches one of
 *   `options.repeatedHeaderFooterTexts`, and 0.5x when a chunk shorter than
 *   80 characters looks like boilerplate (page numbers, "confidential",
 *   copyright lines, ...).
 *
 * @param {Iterable<object>} results - Search results to re-score in place.
 * @param {{headingBoost?: boolean, atomicBoost?: boolean,
 *   contentTypeQuery?: string, repeatedHeaderFooterTexts?: string[]}} options
 * @returns {void}
 */
function applyMetadataBoosts(results, options) {
    // The query never changes between results, so classify it at most once.
    let queryPrefs = null;
    const classifyContentQuery = () => {
        if (queryPrefs === null) {
            const q = options.contentTypeQuery.toLowerCase();
            queryPrefs = {
                table: /\b(table|data|statistic|row|column|figure|chart)\b/.test(q),
                code: /\b(code|function|class|method|import|variable|api)\b/.test(q),
                list: /\b(list|items|steps|requirements|criteria)\b/.test(q),
            };
        }
        return queryPrefs;
    };
    const knownRepeats = options.repeatedHeaderFooterTexts;
    const scoreFields = ['bm25_score', 'similarity_score', 'rrf_score'];
    for (const hit of results) {
        let boost = 1.0;
        // Task 2.1: shallower headings rank higher.
        if (options.headingBoost !== false) {
            const rawLevel = hit.heading_level ?? 5;
            const clampedLevel = Math.min(Math.max(rawLevel, 1), 4);
            boost *= 1 + (0.1 * (4 - clampedLevel));
        }
        // Task 2.2: complete semantic units.
        if (options.atomicBoost !== false && hit.is_atomic) {
            boost *= 1.1;
        }
        // Task 2.3: favour the content type the query appears to ask for.
        if (options.contentTypeQuery) {
            const prefs = classifyContentQuery();
            const types = hit.content_types;
            if (types) {
                if (prefs.table && types.includes('"table"')) {
                    boost *= 1.2;
                }
                if (prefs.code && types.includes('"code"')) {
                    boost *= 1.2;
                }
                if (prefs.list && types.includes('"list"')) {
                    boost *= 1.15;
                }
            }
        }
        // Task 4.3: block confidence derived from the chunk's content types.
        try {
            const rawTypes = hit.content_types;
            if (rawTypes) {
                const typeList = JSON.parse(rawTypes);
                if (Array.isArray(typeList) && typeList.length > 0) {
                    const blockConf = computeBlockConfidence(typeList);
                    boost *= 0.8 + (0.4 * blockConf);
                }
            }
        }
        catch (error) {
            console.error(`[search] Failed to parse content_types for chunk ${hit.chunk_id ?? 'unknown'} during quality boost: ${error instanceof Error ? error.message : String(error)}`);
        }
        // Task 7.1, tier 1: texts the caller already identified as repeated.
        const chunkText = hit.original_text ?? '';
        if (knownRepeats && knownRepeats.length > 0) {
            if (chunkText.length > 0 && isRepeatedHeaderFooter(chunkText, knownRepeats)) {
                boost *= 0.5;
            }
        }
        // Task 7.1, tier 2: heuristic for short boilerplate-looking chunks.
        const trimmed = chunkText.trim();
        if (trimmed.length > 0 && trimmed.length < 80) {
            const lowered = trimmed.toLowerCase();
            const looksLikeBoilerplate = /^page\s+\d+(\s+of\s+\d+)?$/i.test(trimmed) ||
                /^\d+$/.test(trimmed) ||
                /^-\s*\d+\s*-$/.test(trimmed) ||
                lowered.includes('confidential') ||
                lowered.includes('all rights reserved') ||
                /^copyright\s/i.test(trimmed) ||
                /^\u00a9\s/.test(trimmed);
            if (looksLikeBoilerplate) {
                boost *= 0.5;
            }
        }
        // Keep stacked penalties/boosts within [0.5, 2.0] (M-9, M-11).
        const clampedBoost = Math.max(0.5, Math.min(2.0, boost));
        for (const field of scoreFields) {
            if (hit[field] != null) {
                hit[field] = hit[field] * clampedBoost;
            }
        }
    }
}
666
/**
 * Mildly down-weight results that come from unusually long documents
 * (mutates `bm25_score`, `similarity_score` and `rrf_score` where present).
 *
 * The factor is `sqrt(median / docChunks)` clamped to [0.7, 1.0], where
 * `docChunks` is the document's chunk count and `median` is the middle chunk
 * count across the documents represented in the results. Documents at or
 * below the median are therefore unaffected. Nothing happens when the results
 * span at most one document.
 *
 * @param {object[]} results - Search results to re-score in place.
 * @param {object} db - Database service exposing `getConnection()`.
 * @returns {void}
 */
function applyLengthNormalization(results, db) {
    const distinctDocIds = Array.from(new Set(results.map((hit) => hit.document_id).filter(Boolean)));
    if (distinctDocIds.length <= 1) {
        return;
    }
    const marks = distinctDocIds.map(() => '?').join(',');
    const countRows = db
        .getConnection()
        .prepare(`SELECT document_id, COUNT(*) as chunk_count FROM chunks WHERE document_id IN (${marks}) GROUP BY document_id`)
        .all(...distinctDocIds);
    const chunksPerDoc = new Map();
    for (const row of countRows) {
        chunksPerDoc.set(row.document_id, row.chunk_count);
    }
    const ordered = Array.from(chunksPerDoc.values()).sort((a, b) => a - b);
    const median = ordered[Math.floor(ordered.length / 2)] || 1;
    const scoreFields = ['bm25_score', 'similarity_score', 'rrf_score'];
    for (const hit of results) {
        // Documents with no counted chunks are treated as median-length.
        const docChunks = chunksPerDoc.get(hit.document_id) ?? median;
        const rawFactor = Math.sqrt(median / Math.max(docChunks, 1));
        const factor = Math.max(0.7, Math.min(1.0, rawFactor));
        for (const field of scoreFields) {
            if (hit[field] != null) {
                hit[field] = hit[field] * factor;
            }
        }
    }
}
697
/**
 * Drop results whose `content_hash` has already been seen (Task 7.3).
 *
 * The first result carrying a given hash wins; later ones are removed.
 * Results with no (or an empty) hash are always kept. The input array is not
 * modified.
 *
 * @param {object[]} results
 * @returns {object[]} New array with duplicates removed, order preserved.
 */
function deduplicateByContentHash(results) {
    const seenHashes = new Set();
    return results.filter(({ content_hash: hash }) => {
        if (!hash) {
            return true;
        }
        const isFirst = !seenHashes.has(hash);
        if (isFirst) {
            seenHashes.add(hash);
        }
        return isFirst;
    });
}
714
/**
 * Optionally add the formatted provenance chain to a result object.
 *
 * Used by the BM25, semantic and hybrid handlers. When `includeProvenance` is
 * falsy the result is left untouched and no lookup is made.
 *
 * @param {object} result - Result object to decorate in place.
 * @param {object} db - Passed through to `formatProvenanceChain`.
 * @param {string} provenanceId - Provenance record whose chain is attached.
 * @param {boolean} includeProvenance - Whether to attach anything at all.
 * @param {string} [provenanceKey='provenance'] - Field name to write to
 *   ('provenance' or 'provenance_chain').
 * @returns {void}
 */
function attachProvenance(result, db, provenanceId, includeProvenance, provenanceKey = 'provenance') {
    if (!includeProvenance) {
        return;
    }
    result[provenanceKey] = formatProvenanceChain(db, provenanceId);
}
725
/**
 * Reward hybrid results that sit close to other results in the same document
 * (mutates `rrf_score`).
 *
 * For each result with a `document_id` and a non-null `chunk_index`, counts
 * the other results of that document whose chunk index differs by 1 or 2
 * (results sharing the exact same index are not counted). When the count is
 * positive and `rrf_score` is a number, the score is multiplied by
 * `1 + 0.1 * count`.
 *
 * @param {object[]} results - Hybrid search results to re-score in place.
 * @returns {{boosted_results: number}|undefined} Number of boosted results,
 *   or `undefined` when nothing was boosted.
 */
function applyChunkProximityBoost(results) {
    const positionsByDoc = new Map();
    results.forEach((hit, idx) => {
        const { document_id: docId, chunk_index: chunkIndex } = hit;
        if (!docId || chunkIndex === undefined || chunkIndex === null) {
            return;
        }
        const positions = positionsByDoc.get(docId);
        if (positions) {
            positions.push({ idx, chunkIndex });
        }
        else {
            positionsByDoc.set(docId, [{ idx, chunkIndex }]);
        }
    });
    let boosted = 0;
    for (const positions of positionsByDoc.values()) {
        if (positions.length < 2) {
            continue;
        }
        for (const current of positions) {
            let nearby = 0;
            for (const other of positions) {
                const distance = Math.abs(other.chunkIndex - current.chunkIndex);
                if (distance <= 2 && other.chunkIndex !== current.chunkIndex) {
                    nearby += 1;
                }
            }
            if (nearby === 0) {
                continue;
            }
            const target = results[current.idx];
            if (typeof target.rrf_score === 'number') {
                target.rrf_score = target.rrf_score * (1 + 0.1 * nearby);
                boosted += 1;
            }
        }
    }
    return boosted > 0 ? { boosted_results: boosted } : undefined;
}
759
/**
 * Reshape BM25 results into the ranked-entry format consumed by RRF fusion.
 *
 * Identity, location and source fields are copied as-is, `bm25_score` becomes
 * `score`, `rank` is carried over, and optional metadata falls back to a
 * neutral default when null/undefined (`null`, `0`, `false`, `''` for
 * `embedding_id`, `'pending'` for `embedding_status`).
 *
 * @param {object[]} results - BM25 results carrying `bm25_score` and `rank`.
 * @returns {object[]} One ranked entry per input result, in the same order.
 */
function toBm25Ranked(results) {
    const ranked = [];
    for (const hit of results) {
        const {
            chunk_id, image_id, extraction_id, document_id, original_text, result_type,
            source_file_path, source_file_name, source_file_hash,
            page_number, character_start, character_end, chunk_index,
            provenance_id, content_hash, rank, bm25_score,
        } = hit;
        ranked.push({
            chunk_id,
            image_id,
            extraction_id,
            embedding_id: hit.embedding_id ?? '',
            document_id,
            original_text,
            result_type,
            source_file_path,
            source_file_name,
            source_file_hash,
            page_number,
            character_start,
            character_end,
            chunk_index,
            provenance_id,
            content_hash,
            rank,
            score: bm25_score,
            heading_context: hit.heading_context ?? null,
            section_path: hit.section_path ?? null,
            content_types: hit.content_types ?? null,
            is_atomic: hit.is_atomic ?? false,
            page_range: hit.page_range ?? null,
            heading_level: hit.heading_level ?? null,
            ocr_quality_score: hit.ocr_quality_score ?? null,
            doc_title: hit.doc_title ?? null,
            doc_author: hit.doc_author ?? null,
            doc_subject: hit.doc_subject ?? null,
            overlap_previous: hit.overlap_previous ?? 0,
            overlap_next: hit.overlap_next ?? 0,
            chunking_strategy: hit.chunking_strategy ?? null,
            embedding_status: hit.embedding_status ?? 'pending',
            doc_page_count: hit.doc_page_count ?? null,
            datalab_mode: hit.datalab_mode ?? null,
            total_chunks: hit.total_chunks ?? 0,
        });
    }
    return ranked;
}
801
/**
 * Reshape semantic (vector) hits into the ranked records consumed by RRF fusion.
 *
 * The rank is derived from list position (first hit = 1) and the hit's
 * `similarity_score` is exposed as `score`. The hit's `chunk_page_range` is
 * surfaced under the `page_range` key. Missing optional metadata falls back to
 * null, 0, false or 'pending' as shown below.
 *
 * @param {Array<object>} results - Semantic hits carrying `similarity_score`, in ranked order.
 * @returns {Array<object>} One ranked record per hit, in the same order.
 */
function toSemanticRanked(results) {
    const orNull = (value) => value ?? null;
    const orZero = (value) => value ?? 0;
    return results.map((hit, position) => {
        const ranked = {
            chunk_id: hit.chunk_id,
            image_id: hit.image_id,
            extraction_id: hit.extraction_id,
            embedding_id: hit.embedding_id,
            document_id: hit.document_id,
            original_text: hit.original_text,
            result_type: hit.result_type,
            source_file_path: hit.source_file_path,
            source_file_name: hit.source_file_name,
            source_file_hash: hit.source_file_hash,
            page_number: hit.page_number,
            character_start: hit.character_start,
            character_end: hit.character_end,
            chunk_index: hit.chunk_index,
            total_chunks: orZero(hit.total_chunks),
            provenance_id: hit.provenance_id,
            content_hash: hit.content_hash,
            rank: position + 1,
            score: hit.similarity_score,
            heading_context: orNull(hit.heading_context),
            section_path: orNull(hit.section_path),
            content_types: orNull(hit.content_types),
            is_atomic: hit.is_atomic ?? false,
            page_range: orNull(hit.chunk_page_range),
            heading_level: orNull(hit.heading_level),
            ocr_quality_score: orNull(hit.ocr_quality_score),
            doc_title: orNull(hit.doc_title),
            doc_author: orNull(hit.doc_author),
            doc_subject: orNull(hit.doc_subject),
            overlap_previous: orZero(hit.overlap_previous),
            overlap_next: orZero(hit.overlap_next),
            chunking_strategy: orNull(hit.chunking_strategy),
            embedding_status: hit.embedding_status ?? 'pending',
            doc_page_count: orNull(hit.doc_page_count),
            datalab_mode: orNull(hit.datalab_mode),
        };
        return ranked;
    });
}
843
+ // ═══════════════════════════════════════════════════════════════════════════════
844
+ // SEARCH TOOL HANDLERS
845
+ // ═══════════════════════════════════════════════════════════════════════════════
846
/**
 * Project a raw vector-search hit onto the result record returned by semantic search.
 * Shared by the reranked and the plain code paths so both emit the same shape.
 * Missing optional metadata falls back to null, 0, false or 'pending'.
 *
 * @param {object} hit - Hit as returned by vector.searchSimilar.
 * @returns {object} Fresh result record (the hit itself is not modified).
 */
function buildSemanticResultRecord(hit) {
    return {
        embedding_id: hit.embedding_id,
        chunk_id: hit.chunk_id,
        image_id: hit.image_id,
        extraction_id: hit.extraction_id ?? null,
        document_id: hit.document_id,
        result_type: hit.result_type,
        similarity_score: hit.similarity_score,
        original_text: hit.original_text,
        source_file_path: hit.source_file_path,
        source_file_name: hit.source_file_name,
        source_file_hash: hit.source_file_hash,
        page_number: hit.page_number,
        character_start: hit.character_start,
        character_end: hit.character_end,
        chunk_index: hit.chunk_index,
        total_chunks: hit.total_chunks,
        content_hash: hit.content_hash,
        provenance_id: hit.provenance_id,
        heading_context: hit.heading_context ?? null,
        section_path: hit.section_path ?? null,
        content_types: hit.content_types ?? null,
        is_atomic: hit.is_atomic ?? false,
        chunk_page_range: hit.chunk_page_range ?? null,
        heading_level: hit.heading_level ?? null,
        ocr_quality_score: hit.ocr_quality_score ?? null,
        doc_title: hit.doc_title ?? null,
        doc_author: hit.doc_author ?? null,
        doc_subject: hit.doc_subject ?? null,
        overlap_previous: hit.overlap_previous ?? 0,
        overlap_next: hit.overlap_next ?? 0,
        chunking_strategy: hit.chunking_strategy ?? null,
        embedding_status: hit.embedding_status ?? 'pending',
        doc_page_count: hit.doc_page_count ?? null,
        datalab_mode: hit.datalab_mode ?? null,
    };
}
/**
 * Task 3.5: pick the similarity threshold to enforce and describe how it was chosen.
 *
 * - explicit: the caller supplied a threshold; it is used unchanged.
 * - adaptive: threshold = mean - stddev of the candidate scores, clamped to [0.15, 0.5].
 * - adaptive_fallback: fewer than two candidates, so the requested (default) value is used.
 *
 * @param {Array<object>} results - Candidates carrying `similarity_score`.
 * @param {number} requestedThreshold - Caller-supplied or default threshold.
 * @param {boolean} useAdaptiveThreshold - True when the caller did not supply a threshold.
 * @returns {{effectiveThreshold: number, thresholdInfo: object}}
 */
function computeSemanticThreshold(results, requestedThreshold, useAdaptiveThreshold) {
    if (!useAdaptiveThreshold) {
        return {
            effectiveThreshold: requestedThreshold,
            thresholdInfo: {
                mode: 'explicit',
                requested: requestedThreshold,
                effective: requestedThreshold,
            },
        };
    }
    if (results.length <= 1) {
        // Too few results for stats, fall back to default
        return {
            effectiveThreshold: requestedThreshold,
            thresholdInfo: {
                mode: 'adaptive_fallback',
                requested: requestedThreshold,
                effective: requestedThreshold,
                reason: 'too_few_results_for_adaptive',
            },
        };
    }
    // Reported figures are rounded to three decimals; the enforced threshold is not.
    const round3 = (value) => Math.round(value * 1000) / 1000;
    const scores = results.map((r) => r.similarity_score);
    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
    const variance = scores.reduce((a, b) => a + (b - mean) ** 2, 0) / scores.length;
    const stddev = Math.sqrt(variance);
    const adaptiveRaw = mean - stddev;
    const effectiveThreshold = Math.max(0.15, Math.min(0.5, adaptiveRaw));
    return {
        effectiveThreshold,
        thresholdInfo: {
            mode: 'adaptive',
            requested: requestedThreshold,
            effective: round3(effectiveThreshold),
            adaptive_raw: round3(adaptiveRaw),
            distribution: {
                mean: round3(mean),
                stddev: round3(stddev),
                candidates_evaluated: results.length,
            },
        },
    };
}
/**
 * Internal: Semantic vector search logic (called by unified handler)
 *
 * Steps, all inside withDatabaseOperation: resolve document and chunk filters,
 * embed the original query, run the vector search, apply the (explicit or
 * adaptive) similarity threshold, optionally rerank, then boost, sort, filter
 * and enrich the results before building the response.
 *
 * @param {object} params - Search parameters, already validated and enriched by handleSearchUnified.
 * @returns {Promise<*>} formatResponse(successResult(...)) on success; handleError(error) if anything throws.
 */
async function handleSearchSemanticInternal(params) {
    try {
        return await withDatabaseOperation(async ({ db, vector }) => {
            // Params already validated and enriched by handleSearchUnified
            const input = params;
            const conn = db.getConnection();
            // Semantic mode: skip query expansion entirely.
            // expand_query produces FTS5 OR-joined terms which have zero effect on vector search.
            // The embedding is always generated from the original query.
            // Resolve metadata filter to document IDs, then chain through quality + cluster filters
            const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
            // Resolve chunk-level filters
            const chunkFilter = resolveChunkFilter({
                content_type_filter: input.content_type_filter,
                section_path_filter: input.section_path_filter,
                heading_filter: input.heading_filter,
                page_range_filter: input.page_range_filter,
                is_atomic_filter: input.is_atomic_filter,
                heading_level_filter: input.heading_level_filter,
                min_page_count: input.min_page_count,
                max_page_count: input.max_page_count,
                table_columns_contain: input.table_columns_contain,
            });
            // Generate query embedding from original query (prefixed with the section when one is filtered on)
            const embedder = getEmbeddingService();
            let embeddingQuery = input.query;
            if (input.section_path_filter) {
                embeddingQuery = `[Section: ${input.section_path_filter}] ${embeddingQuery}`;
            }
            const queryVector = await embedder.embedSearchQuery(embeddingQuery);
            const limit = input.limit ?? 10;
            // Reranking needs a wider candidate pool than the number of results finally returned.
            const searchLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
            const requestedThreshold = input.similarity_threshold ?? 0.7;
            // Task 3.5: Adaptive similarity threshold
            // When user does NOT explicitly provide a threshold, use adaptive mode:
            // fetch extra candidates with low floor, then compute threshold from distribution
            const useAdaptiveThreshold = params.similarity_threshold === undefined;
            const searchThreshold = useAdaptiveThreshold ? 0.1 : requestedThreshold;
            const adaptiveFetchLimit = useAdaptiveThreshold ? Math.max(searchLimit * 3, 30) : searchLimit;
            // Search for similar vectors
            const results = vector.searchSimilar(queryVector, {
                limit: adaptiveFetchLimit,
                threshold: searchThreshold,
                documentFilter,
                chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
                pageRangeFilter: input.page_range_filter,
            });
            const { effectiveThreshold, thresholdInfo } = computeSemanticThreshold(results, requestedThreshold, useAdaptiveThreshold);
            // Filter results by effective threshold and apply final limit
            const thresholdFiltered = results
                .filter((r) => r.similarity_score >= effectiveThreshold)
                .slice(0, searchLimit);
            let finalResults;
            let rerankInfo;
            if (input.rerank && thresholdFiltered.length > 0) {
                // The reranker only receives the identifying/text fields; rank is a placeholder (0).
                const rerankInput = thresholdFiltered.map((r) => ({
                    chunk_id: r.chunk_id,
                    image_id: r.image_id,
                    extraction_id: r.extraction_id,
                    embedding_id: r.embedding_id,
                    document_id: r.document_id,
                    original_text: r.original_text,
                    result_type: r.result_type,
                    source_file_path: r.source_file_path,
                    source_file_name: r.source_file_name,
                    source_file_hash: r.source_file_hash,
                    page_number: r.page_number,
                    character_start: r.character_start,
                    character_end: r.character_end,
                    chunk_index: r.chunk_index,
                    provenance_id: r.provenance_id,
                    content_hash: r.content_hash,
                    rank: 0,
                    score: r.similarity_score,
                }));
                const reranked = await rerankResults(input.query, rerankInput, limit);
                finalResults = reranked.map((r) => {
                    // original_index points back into the candidate list handed to the reranker.
                    const original = thresholdFiltered[r.original_index];
                    const result = {
                        ...buildSemanticResultRecord(original),
                        rerank_score: r.relevance_score,
                        rerank_reasoning: r.reasoning,
                    };
                    attachProvenance(result, db, original.provenance_id, !!input.include_provenance);
                    return result;
                });
                rerankInfo = {
                    reranked: true,
                    candidates_evaluated: Math.min(thresholdFiltered.length, 20),
                    results_returned: finalResults.length,
                };
            }
            else {
                finalResults = thresholdFiltered.map((r) => {
                    const result = buildSemanticResultRecord(r);
                    attachProvenance(result, db, r.provenance_id, !!input.include_provenance);
                    return result;
                });
            }
            // Apply metadata-based score boosts and length normalization
            applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
            applyLengthNormalization(finalResults, db);
            // Re-sort by similarity_score after boosts
            // NOTE(review): this also re-orders reranked results by similarity_score, so the
            // reranker picks which results survive but not their final order - confirm intended.
            finalResults.sort((a, b) => b.similarity_score - a.similarity_score);
            // Enrich VLM results with image metadata
            enrichVLMResultsWithImageMetadata(conn, finalResults);
            // Task 7.3: Deduplicate by content_hash if requested
            if (input.exclude_duplicate_chunks) {
                finalResults = deduplicateByContentHash(finalResults);
            }
            // T2.8: Exclude system:repeated_header_footer tagged chunks by default
            if (!input.include_headers_footers) {
                finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
            }
            // Task 3.1: Cluster context is attached when include_cluster_context is truthy and there
            // is at least one result. Coerced so the response flag is always a real boolean.
            const clusterContextIncluded = Boolean(input.include_cluster_context) && finalResults.length > 0;
            if (clusterContextIncluded) {
                attachClusterContext(conn, finalResults);
            }
            // Phase 4: Attach neighbor context chunks if requested
            const contextChunkCount = input.include_context_chunks ?? 0;
            if (contextChunkCount > 0) {
                attachContextChunks(conn, finalResults, contextChunkCount);
            }
            // Phase 5: Attach table metadata for atomic table chunks
            attachTableMetadata(conn, finalResults);
            // T2.12: Attach cross-document context if requested
            if (input.include_document_context) {
                attachCrossDocumentContext(conn, finalResults);
            }
            const responseData = {
                query: input.query,
                results: finalResults,
                total: finalResults.length,
                threshold: effectiveThreshold,
                threshold_info: thresholdInfo,
                metadata_boosts_applied: true,
                cluster_context_included: clusterContextIncluded,
                next_steps: finalResults.length === 0
                    ? [
                        { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
                        { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
                    ]
                    : finalResults.length === 1
                        ? [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_find_similar', description: 'Find related documents' },
                        ]
                        : [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
                        ],
            };
            // No query_expansion in semantic mode - expansion only applies to BM25/hybrid.
            if (rerankInfo) {
                responseData.rerank = rerankInfo;
            }
            // V7: Apply compact mode and provenance summaries before grouping
            applyV7Transforms(responseData, input, db, 'semantic');
            if (input.group_by_document) {
                const { grouped, total_documents } = groupResultsByDocument(responseData.results);
                const groupedResponse = {
                    ...responseData,
                    total_results: finalResults.length,
                    total_documents,
                    documents: grouped,
                };
                // The grouped shape replaces the flat list and its count.
                delete groupedResponse.results;
                delete groupedResponse.total;
                return formatResponse(successResult(groupedResponse));
            }
            return formatResponse(successResult(responseData));
        }); // end withDatabaseOperation
    }
    catch (error) {
        return handleError(error);
    }
}
1136
/**
 * Internal: BM25 full-text keyword search logic (called by unified handler)
 *
 * Steps, all inside withDatabaseOperation: optionally expand the query, resolve
 * document and chunk filters, run the chunk / VLM / extraction FTS searches,
 * merge them by bm25_score, optionally rerank, then boost, sort, filter and
 * enrich the results before building the response.
 *
 * @param {object} params - Search parameters, already validated and enriched by handleSearchUnified.
 * @returns {Promise<*>} formatResponse(successResult(...)) on success; handleError(error) if anything throws.
 */
async function handleSearchKeywordInternal(params) {
    try {
        return await withDatabaseOperation(async ({ db }) => {
            // Params already validated and enriched by handleSearchUnified
            const input = params;
            const conn = db.getConnection();
            // Expand query with domain-specific synonyms + corpus cluster terms if requested
            const tableQueryDetected = isTableQuery(input.query);
            let searchQuery = input.query;
            let queryExpansion;
            if (input.expand_query) {
                searchQuery = expandQuery(input.query, db, tableQueryDetected);
                queryExpansion = getExpandedTerms(input.query, db, tableQueryDetected);
            }
            // Resolve metadata filter to document IDs, then chain through quality + cluster filters
            const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
            // Resolve chunk-level filters
            const chunkFilter = resolveChunkFilter({
                content_type_filter: input.content_type_filter,
                section_path_filter: input.section_path_filter,
                heading_filter: input.heading_filter,
                page_range_filter: input.page_range_filter,
                is_atomic_filter: input.is_atomic_filter,
                heading_level_filter: input.heading_level_filter,
                min_page_count: input.min_page_count,
                max_page_count: input.max_page_count,
                table_columns_contain: input.table_columns_contain,
            });
            const bm25 = new BM25SearchService(conn);
            const limit = input.limit ?? 10;
            // Over-fetch from each of the three sources (at least limit * 2) since we merge and truncate
            const fetchLimit = input.rerank ? Math.max(limit * 2, 20) : limit * 2;
            // When expand_query produced an OR-joined FTS5 expression, pass preSanitized
            // to prevent sanitizeFTS5Query from inserting implicit AND (H-2 fix).
            const preSanitized = !!input.expand_query;
            // Options common to all three FTS searches; each call receives its own copy.
            const sharedSearchOptions = {
                query: searchQuery,
                limit: fetchLimit,
                phraseSearch: input.phrase_search,
                documentFilter,
                includeHighlight: input.include_highlight,
                preSanitized,
            };
            // Search chunks FTS (the only source that receives the chunk-level filter)
            const chunkResults = bm25.search({
                ...sharedSearchOptions,
                chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
            });
            // Search VLM FTS (receives the page range filter only)
            const vlmResults = bm25.searchVLM({
                ...sharedSearchOptions,
                pageRangeFilter: input.page_range_filter,
            });
            // Search extractions FTS (document filter only)
            const extractionResults = bm25.searchExtractions({ ...sharedSearchOptions });
            // Merge by score (higher is better), apply combined limit
            const mergeLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
            const allResults = [...chunkResults, ...vlmResults, ...extractionResults]
                .sort((a, b) => b.bm25_score - a.bm25_score)
                .slice(0, mergeLimit);
            // Re-rank after merge: rank reflects the position in the merged list
            const rankedResults = allResults.map((r, i) => ({ ...r, rank: i + 1 }));
            let finalResults;
            let rerankInfo;
            if (input.rerank && rankedResults.length > 0) {
                const rerankInput = rankedResults.map((r) => ({ ...r }));
                const reranked = await rerankResults(input.query, rerankInput, limit);
                finalResults = reranked.map((r) => {
                    // original_index points back into the candidate list handed to the reranker.
                    const original = rankedResults[r.original_index];
                    const base = {
                        ...original,
                        rerank_score: r.relevance_score,
                        rerank_reasoning: r.reasoning,
                    };
                    attachProvenance(base, db, original.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
                rerankInfo = {
                    reranked: true,
                    candidates_evaluated: Math.min(rankedResults.length, 20),
                    results_returned: finalResults.length,
                };
            }
            else {
                finalResults = rankedResults.map((r) => {
                    const base = { ...r };
                    attachProvenance(base, db, r.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
            }
            // Apply metadata-based score boosts and length normalization
            applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
            applyLengthNormalization(finalResults, db);
            // Re-sort by bm25_score after boosts
            // NOTE(review): this also re-orders reranked results by bm25_score, so the
            // reranker picks which results survive but not their final order - confirm intended.
            finalResults.sort((a, b) => b.bm25_score - a.bm25_score);
            // Enrich VLM results with image metadata
            enrichVLMResultsWithImageMetadata(conn, finalResults);
            // Task 7.3: Deduplicate by content_hash if requested
            if (input.exclude_duplicate_chunks) {
                finalResults = deduplicateByContentHash(finalResults);
            }
            // T2.8: Exclude system:repeated_header_footer tagged chunks by default
            if (!input.include_headers_footers) {
                finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
            }
            // Compute source counts from final merged results (not pre-merge candidates).
            // Anything that is neither 'chunk' nor 'vlm' is counted as an extraction.
            let finalChunkCount = 0;
            let finalVlmCount = 0;
            let finalExtractionCount = 0;
            for (const r of finalResults) {
                if (r.result_type === 'chunk') {
                    finalChunkCount++;
                }
                else if (r.result_type === 'vlm') {
                    finalVlmCount++;
                }
                else {
                    finalExtractionCount++;
                }
            }
            // Task 3.1: Cluster context is attached when include_cluster_context is truthy and there
            // is at least one result. Coerced so the response flag is always a real boolean.
            const clusterContextIncluded = Boolean(input.include_cluster_context) && finalResults.length > 0;
            if (clusterContextIncluded) {
                attachClusterContext(conn, finalResults);
            }
            // Phase 4: Attach neighbor context chunks if requested
            const contextChunkCount = input.include_context_chunks ?? 0;
            if (contextChunkCount > 0) {
                attachContextChunks(conn, finalResults, contextChunkCount);
            }
            // Phase 5: Attach table metadata for atomic table chunks
            attachTableMetadata(conn, finalResults);
            // T2.12: Attach cross-document context if requested
            if (input.include_document_context) {
                attachCrossDocumentContext(conn, finalResults);
            }
            // Document metadata matches (v30 FTS5 on doc_title/author/subject).
            // Uses the original query, not the expanded one; at most 5 matches.
            let documentMetadataMatches;
            const metadataResults = bm25.searchDocumentMetadata({
                query: input.query,
                limit: 5,
                phraseSearch: input.phrase_search,
            });
            if (metadataResults.length > 0) {
                documentMetadataMatches = metadataResults;
            }
            const responseData = {
                query: input.query,
                search_type: 'bm25',
                results: finalResults,
                total: finalResults.length,
                sources: {
                    chunk_count: finalChunkCount,
                    vlm_count: finalVlmCount,
                    extraction_count: finalExtractionCount,
                },
                metadata_boosts_applied: true,
                cluster_context_included: clusterContextIncluded,
                next_steps: finalResults.length === 0
                    ? [
                        { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
                        { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
                    ]
                    : finalResults.length === 1
                        ? [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_find_similar', description: 'Find related documents' },
                        ]
                        : [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
                        ],
            };
            if (documentMetadataMatches) {
                responseData.document_metadata_matches = documentMetadataMatches;
            }
            // Task 3.2: Standardized query expansion details
            if (queryExpansion) {
                responseData.query_expansion = {
                    original_query: queryExpansion.original,
                    expanded_query: searchQuery,
                    synonyms_found: queryExpansion.synonyms_found,
                    terms_added: queryExpansion.expanded.length,
                    corpus_terms: queryExpansion.corpus_terms,
                };
            }
            if (rerankInfo) {
                responseData.rerank = rerankInfo;
            }
            // V7: Apply compact mode and provenance summaries before grouping
            applyV7Transforms(responseData, input, db, 'keyword');
            if (input.group_by_document) {
                const { grouped, total_documents } = groupResultsByDocument(responseData.results);
                const groupedResponse = {
                    ...responseData,
                    total_results: finalResults.length,
                    total_documents,
                    documents: grouped,
                };
                // The grouped shape replaces the flat list and its count.
                delete groupedResponse.results;
                delete groupedResponse.total;
                return formatResponse(successResult(groupedResponse));
            }
            return formatResponse(successResult(responseData));
        }); // end withDatabaseOperation
    }
    catch (error) {
        return handleError(error);
    }
}
1357
+ /**
1358
+ * Internal: Hybrid search using Reciprocal Rank Fusion (called by unified handler)
1359
+ */
1360
+ async function handleSearchHybridInternal(params) {
1361
+ try {
1362
+ return await withDatabaseOperation(async ({ db, vector }) => {
1363
+ // Params already validated and enriched by handleSearchUnified
1364
+ const input = params;
1365
+ const limit = input.limit ?? 10;
1366
+ const conn = db.getConnection();
1367
+ // Auto-route: classify query and adjust weights
1368
+ let queryClassification;
1369
+ if (input.auto_route) {
1370
+ queryClassification = classifyQuery(input.query);
1371
+ if (queryClassification.query_type === 'exact') {
1372
+ input.bm25_weight = 1.5;
1373
+ input.semantic_weight = 0.5;
1374
+ }
1375
+ else if (queryClassification.query_type === 'semantic') {
1376
+ input.bm25_weight = 0.5;
1377
+ input.semantic_weight = 1.5;
1378
+ }
1379
+ // 'mixed' keeps defaults (1.0/1.0)
1380
+ }
1381
+ // Expand query with domain-specific synonyms + corpus cluster terms if requested
1382
+ const tableQueryDetected = isTableQuery(input.query);
1383
+ let searchQuery = input.query;
1384
+ let queryExpansion;
1385
+ if (input.expand_query) {
1386
+ searchQuery = expandQuery(input.query, db, tableQueryDetected);
1387
+ queryExpansion = getExpandedTerms(input.query, db, tableQueryDetected);
1388
+ }
1389
+ // Resolve metadata filter to document IDs, then chain through quality + cluster filters
1390
+ const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
1391
+ // Resolve chunk-level filters
1392
+ const chunkFilter = resolveChunkFilter({
1393
+ content_type_filter: input.content_type_filter,
1394
+ section_path_filter: input.section_path_filter,
1395
+ heading_filter: input.heading_filter,
1396
+ page_range_filter: input.page_range_filter,
1397
+ is_atomic_filter: input.is_atomic_filter,
1398
+ heading_level_filter: input.heading_level_filter,
1399
+ min_page_count: input.min_page_count,
1400
+ max_page_count: input.max_page_count,
1401
+ table_columns_contain: input.table_columns_contain,
1402
+ });
1403
+ // Get BM25 results (chunks + VLM + extractions)
1404
+ const bm25 = new BM25SearchService(db.getConnection());
1405
+ // When expand_query produced an OR-joined FTS5 expression, pass preSanitized
1406
+ // to prevent sanitizeFTS5Query from inserting implicit AND (H-2 fix).
1407
+ const preSanitized = !!input.expand_query;
1408
+ // includeHighlight: false -- hybrid discards BM25 highlights (RRF doesn't surface snippets)
1409
+ const bm25ChunkResults = bm25.search({
1410
+ query: searchQuery,
1411
+ limit: limit * 2,
1412
+ documentFilter,
1413
+ includeHighlight: false,
1414
+ chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
1415
+ preSanitized,
1416
+ });
1417
+ const bm25VlmResults = bm25.searchVLM({
1418
+ query: searchQuery,
1419
+ limit: limit * 2,
1420
+ documentFilter,
1421
+ includeHighlight: false,
1422
+ pageRangeFilter: input.page_range_filter,
1423
+ preSanitized,
1424
+ });
1425
+ const bm25ExtractionResults = bm25.searchExtractions({
1426
+ query: searchQuery,
1427
+ limit: limit * 2,
1428
+ documentFilter,
1429
+ includeHighlight: false,
1430
+ preSanitized,
1431
+ });
1432
+ // Merge BM25 results by score
1433
+ const allBm25 = [...bm25ChunkResults, ...bm25VlmResults, ...bm25ExtractionResults]
1434
+ .sort((a, b) => b.bm25_score - a.bm25_score)
1435
+ .slice(0, limit * 2)
1436
+ .map((r, i) => ({ ...r, rank: i + 1 }));
1437
+ // Get semantic results using ORIGINAL query (not FTS5-expanded)
1438
+ // The expanded query contains OR operators that contaminate embedding vectors
1439
+ const embedder = getEmbeddingService();
1440
+ let hybridEmbeddingQuery = input.query;
1441
+ if (input.section_path_filter) {
1442
+ hybridEmbeddingQuery = `[Section: ${input.section_path_filter}] ${hybridEmbeddingQuery}`;
1443
+ }
1444
+ const queryVector = await embedder.embedSearchQuery(hybridEmbeddingQuery);
1445
+ const semanticResults = vector.searchSimilar(queryVector, {
1446
+ limit: limit * 2,
1447
+ // Lower threshold than standalone (0.7) -- RRF de-ranks low-quality results
1448
+ threshold: 0.3,
1449
+ documentFilter,
1450
+ chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
1451
+ pageRangeFilter: input.page_range_filter,
1452
+ });
1453
+ // Convert to ranked format and fuse with RRF
1454
+ const bm25Ranked = toBm25Ranked(allBm25);
1455
+ const semanticRanked = toSemanticRanked(semanticResults);
1456
+ const fusion = new RRFFusion({
1457
+ k: input.rrf_k,
1458
+ bm25Weight: input.bm25_weight,
1459
+ semanticWeight: input.semantic_weight,
1460
+ });
1461
+ const fusionLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
1462
+ const rawResults = fusion.fuse(bm25Ranked, semanticRanked, fusionLimit);
1463
+ let finalResults;
1464
+ let rerankInfo;
1465
+ if (input.rerank && rawResults.length > 0) {
1466
+ const rerankInput = rawResults.map((r) => ({ ...r }));
1467
+ const reranked = await rerankResults(input.query, rerankInput, limit);
1468
+ finalResults = reranked.map((r) => {
1469
+ const original = rawResults[r.original_index];
1470
+ const base = {
1471
+ ...original,
1472
+ rerank_score: r.relevance_score,
1473
+ rerank_reasoning: r.reasoning,
1474
+ };
1475
+ attachProvenance(base, db, original.provenance_id, !!input.include_provenance, 'provenance_chain');
1476
+ return base;
1477
+ });
1478
+ rerankInfo = {
1479
+ reranked: true,
1480
+ candidates_evaluated: Math.min(rawResults.length, 20),
1481
+ results_returned: finalResults.length,
1482
+ };
1483
+ }
1484
+ else {
1485
+ finalResults = rawResults.map((r) => {
1486
+ const base = { ...r };
1487
+ attachProvenance(base, db, r.provenance_id, !!input.include_provenance, 'provenance_chain');
1488
+ return base;
1489
+ });
1490
+ }
1491
+ // Chunk proximity boost - reward clusters of nearby relevant chunks
1492
+ const chunkProximityInfo = finalResults.length > 0 ? applyChunkProximityBoost(finalResults) : undefined;
1493
+ // Apply metadata-based score boosts and length normalization
1494
+ applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
1495
+ applyLengthNormalization(finalResults, db);
1496
+ // Enrich VLM results with image metadata
1497
+ enrichVLMResultsWithImageMetadata(conn, finalResults);
1498
+ // Re-sort by rrf_score after proximity boost and metadata boosts may have changed scores
1499
+ finalResults.sort((a, b) => b.rrf_score - a.rrf_score);
1500
+ // Task 7.3: Deduplicate by content_hash if requested
1501
+ if (input.exclude_duplicate_chunks) {
1502
+ finalResults = deduplicateByContentHash(finalResults);
1503
+ }
1504
+ // T2.8: Exclude system:repeated_header_footer tagged chunks by default
1505
+ if (!input.include_headers_footers) {
1506
+ finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
1507
+ }
1508
+ // Task 3.1: Cluster context included by default (unless explicitly false)
1509
+ const clusterContextIncluded = input.include_cluster_context && finalResults.length > 0;
1510
+ if (clusterContextIncluded) {
1511
+ attachClusterContext(conn, finalResults);
1512
+ }
1513
+ // Phase 4: Attach neighbor context chunks if requested
1514
+ const contextChunkCount = input.include_context_chunks ?? 0;
1515
+ if (contextChunkCount > 0) {
1516
+ attachContextChunks(conn, finalResults, contextChunkCount);
1517
+ }
1518
+ // Phase 5: Attach table metadata for atomic table chunks
1519
+ attachTableMetadata(db.getConnection(), finalResults);
1520
+ // T2.12: Attach cross-document context if requested
1521
+ if (input.include_document_context) {
1522
+ attachCrossDocumentContext(conn, finalResults);
1523
+ }
1524
+ const responseData = {
1525
+ query: input.query,
1526
+ search_type: 'rrf_hybrid',
1527
+ config: {
1528
+ bm25_weight: input.bm25_weight,
1529
+ semantic_weight: input.semantic_weight,
1530
+ rrf_k: input.rrf_k,
1531
+ },
1532
+ results: finalResults,
1533
+ total: finalResults.length,
1534
+ sources: {
1535
+ bm25_chunk_count: bm25ChunkResults.length,
1536
+ bm25_vlm_count: bm25VlmResults.length,
1537
+ bm25_extraction_count: bm25ExtractionResults.length,
1538
+ semantic_count: semanticResults.length,
1539
+ },
1540
+ metadata_boosts_applied: true,
1541
+ cluster_context_included: clusterContextIncluded,
1542
+ next_steps: finalResults.length === 0
1543
+ ? [
1544
+ { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
1545
+ { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
1546
+ ]
1547
+ : finalResults.length === 1
1548
+ ? [
1549
+ { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
1550
+ { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
1551
+ { tool: 'ocr_document_find_similar', description: 'Find related documents' },
1552
+ ]
1553
+ : [
1554
+ { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
1555
+ { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
1556
+ { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
1557
+ ],
1558
+ };
1559
+ // Task 3.2: Standardized query expansion details
1560
+ if (queryExpansion) {
1561
+ responseData.query_expansion = {
1562
+ original_query: queryExpansion.original,
1563
+ expanded_query: searchQuery,
1564
+ synonyms_found: queryExpansion.synonyms_found,
1565
+ terms_added: queryExpansion.expanded.length,
1566
+ corpus_terms: queryExpansion.corpus_terms,
1567
+ };
1568
+ }
1569
+ if (rerankInfo) {
1570
+ responseData.rerank = rerankInfo;
1571
+ }
1572
+ if (chunkProximityInfo) {
1573
+ responseData.chunk_proximity_boost = chunkProximityInfo;
1574
+ }
1575
+ if (queryClassification) {
1576
+ responseData.query_classification = queryClassification;
1577
+ }
1578
+ // V7: Apply compact mode and provenance summaries before grouping
1579
+ applyV7Transforms(responseData, input, db, 'hybrid');
1580
+ if (input.group_by_document) {
1581
+ const { grouped, total_documents } = groupResultsByDocument(responseData.results);
1582
+ const groupedResponse = {
1583
+ ...responseData,
1584
+ total_results: finalResults.length,
1585
+ total_documents,
1586
+ documents: grouped,
1587
+ };
1588
+ delete groupedResponse.results;
1589
+ delete groupedResponse.total;
1590
+ return formatResponse(successResult(groupedResponse));
1591
+ }
1592
+ return formatResponse(successResult(responseData));
1593
+ }); // end withDatabaseOperation
1594
+ }
1595
+ catch (error) {
1596
+ return handleError(error);
1597
+ }
1598
+ }
1599
+ // ═══════════════════════════════════════════════════════════════════════════════
1600
+ // UNIFIED SEARCH HANDLER
1601
+ // ═══════════════════════════════════════════════════════════════════════════════
1602
/**
 * Handle ocr_search - single entry point for keyword (BM25), semantic (vector)
 * and hybrid (BM25 + semantic, RRF-fused) search.
 *
 * Validates `params` against SearchUnifiedInput, lifts the nested `filters`
 * object to top-level fields (the internal handlers take flat params), pins
 * the always-on flags below, and delegates to the internal handler for `mode`.
 *
 * Flags forced here regardless of what the caller sent:
 * - quality_boost: true
 * - expand_query: true
 * - exclude_duplicate_chunks: true
 * - include_headers_footers: false
 * - include_cluster_context: true
 * - include_document_context: true
 *
 * @param params Raw tool arguments.
 * @returns The selected internal handler's response, or the handleError()
 *   result when validation or the search throws.
 */
export async function handleSearchUnified(params) {
    try {
        const input = validateInput(SearchUnifiedInput, params);
        // Missing `filters` behaves like an empty filter set.
        const {
            document_filter,
            metadata_filter,
            min_quality_score,
            cluster_id,
            content_type_filter,
            section_path_filter,
            heading_filter,
            page_range_filter,
            is_atomic_filter,
            heading_level_filter,
            min_page_count,
            max_page_count,
            table_columns_contain,
        } = input.filters ?? {};
        // Only forward similarity_threshold when the caller supplied one; the
        // key is left out entirely otherwise so the semantic handler can fall
        // back to its adaptive threshold.
        const thresholdOverride = input.similarity_threshold !== undefined
            ? { similarity_threshold: input.similarity_threshold }
            : {};
        const enrichedParams = {
            // Validated top-level params
            query: input.query,
            mode: input.mode,
            limit: input.limit,
            include_provenance: input.include_provenance,
            rerank: input.rerank,
            include_context_chunks: input.include_context_chunks,
            group_by_document: input.group_by_document,
            phrase_search: input.phrase_search,
            include_highlight: input.include_highlight,
            ...thresholdOverride,
            bm25_weight: input.bm25_weight,
            semantic_weight: input.semantic_weight,
            rrf_k: input.rrf_k,
            auto_route: input.auto_route,
            // Filters, flattened for the internal handlers
            document_filter,
            metadata_filter,
            min_quality_score,
            cluster_id,
            content_type_filter,
            section_path_filter,
            heading_filter,
            page_range_filter,
            is_atomic_filter,
            heading_level_filter,
            min_page_count,
            max_page_count,
            table_columns_contain,
            // Always-on defaults
            quality_boost: true,
            expand_query: true,
            exclude_duplicate_chunks: true,
            include_headers_footers: false,
            include_cluster_context: true,
            include_document_context: true,
            // V7 Intelligence Optimization params
            compact: input.compact,
            include_provenance_summary: input.include_provenance_summary,
        };
        // Dispatch on mode; anything that is not keyword/semantic is hybrid.
        if (input.mode === 'keyword') {
            return await handleSearchKeywordInternal(enrichedParams);
        }
        if (input.mode === 'semantic') {
            return await handleSearchSemanticInternal(enrichedParams);
        }
        return await handleSearchHybridInternal(enrichedParams);
    }
    catch (error) {
        return handleError(error);
    }
}
1678
/**
 * Handle ocr_fts_manage - Manage FTS5 indexes (rebuild or check status).
 * Covers both chunks FTS and VLM FTS indexes.
 *
 * - action === 'rebuild': runs BM25SearchService.rebuildIndex() and returns
 *   its result tagged with operation 'fts_rebuild'.
 * - any other action: returns BM25SearchService.getStatus(), extended with
 *   `chunks_without_embeddings` (chunks that have no row in `embeddings`).
 *   That count is best-effort: if the query fails the error is logged and the
 *   status is returned without it.
 *
 * @param params Raw tool arguments, validated against FTSManageInput.
 * @returns Formatted success response, or the handleError() result on failure.
 */
export async function handleFTSManage(params) {
    try {
        const input = validateInput(FTSManageInput, params);
        const { db } = requireDatabase();
        const bm25 = new BM25SearchService(db.getConnection());
        if (input.action === 'rebuild') {
            const result = bm25.rebuildIndex();
            return formatResponse(successResult({
                operation: 'fts_rebuild',
                ...result,
                next_steps: [
                    { tool: 'ocr_search', description: 'Search using the rebuilt index' },
                    { tool: 'ocr_db_stats', description: 'Check database statistics' },
                ],
            }));
        }
        const status = bm25.getStatus();
        // Detect chunks without embeddings (invisible to semantic search)
        try {
            const conn = db.getConnection();
            const gapRow = conn
                .prepare(`SELECT COUNT(*) as cnt FROM chunks c
 LEFT JOIN embeddings e ON e.chunk_id = c.id
 WHERE e.id IS NULL`)
                .get();
            status.chunks_without_embeddings = gapRow.cnt;
        }
        catch (error) {
            // Deliberately non-fatal: the FTS status is still useful on its own.
            console.error(`[Search] Failed to query chunks without embeddings: ${String(error)}`);
        }
        // Nothing was rebuilt on this path, so do not claim a "rebuilt index".
        status.next_steps = [
            { tool: 'ocr_search', description: 'Search using the current index' },
            { tool: 'ocr_db_stats', description: 'Check database statistics' },
        ];
        return formatResponse(successResult(status));
    }
    catch (error) {
        return handleError(error);
    }
}
1712
+ // ═══════════════════════════════════════════════════════════════════════════════
1713
+ // RAG CONTEXT ASSEMBLY HANDLER
1714
+ // ═══════════════════════════════════════════════════════════════════════════════
1715
/**
 * Task 3.3: Drop chunks whose character range is mostly covered by an
 * already-kept chunk of the same document.
 *
 * A result is dropped when more than 50% of ITS OWN length overlaps a kept
 * result with the same `document_id`. Ranges are read from
 * `character_start`/`character_end`, falling back to `char_start`/`char_end`;
 * results without a complete range are always kept. Earlier entries win, so
 * the input must already be sorted by score, descending.
 *
 * @param results Score-sorted search results.
 * @returns The input array itself when it has 0 or 1 entries, otherwise a new
 *   array with the surviving results in their original order.
 */
function deduplicateOverlappingResults(results) {
    if (results.length <= 1)
        return results;
    // Read a result's character range, whichever field naming it uses.
    const spanOf = (item) => ({
        start: item.character_start ?? item.char_start,
        end: item.character_end ?? item.char_end,
    });
    const kept = [];
    for (const candidate of results) {
        const { start, end } = spanOf(candidate);
        if (start == null || end == null) {
            // No range information: cannot be compared, keep it.
            kept.push(candidate);
            continue;
        }
        const candidateLength = end - start;
        const mostlyCovered = kept.some((earlier) => {
            if (earlier.document_id !== candidate.document_id)
                return false;
            const prior = spanOf(earlier);
            if (prior.start == null || prior.end == null)
                return false;
            const sharedStart = Math.max(start, prior.start);
            const sharedEnd = Math.min(end, prior.end);
            if (sharedEnd <= sharedStart)
                return false;
            return candidateLength > 0 && (sharedEnd - sharedStart) / candidateLength > 0.5;
        });
        if (!mostlyCovered)
            kept.push(candidate);
    }
    return kept;
}
1757
/**
 * Task 3.4: Cap how many results any single document may contribute, so one
 * long document cannot dominate the RAG context.
 *
 * Results are taken in input order; once a `document_id` has contributed
 * `maxPerDocument` results, further results from it are skipped.
 *
 * @param results Search results, in priority order.
 * @param maxPerDocument Maximum results kept per `document_id` (default 3).
 * @returns A new array with the surviving results in their original order.
 */
function enforceSourceDiversity(results, maxPerDocument = 3) {
    const keptPerDocument = new Map();
    return results.filter(({ document_id }) => {
        const alreadyKept = keptPerDocument.get(document_id) ?? 0;
        if (alreadyKept >= maxPerDocument)
            return false;
        keptPerDocument.set(document_id, alreadyKept + 1);
        return true;
    });
}
1775
// Integer field constrained to [min, max] that falls back to `fallback` when omitted.
const ragBoundedInt = (min, max, fallback) => z.number().int().min(min).max(max).default(fallback);
/**
 * Input schema for ocr_rag_context. Declared next to its handler instead of
 * in validation.ts because no other tool shares it.
 */
const RagContextInput = z.object({
    question: z.string().min(1).max(2000).describe('The question to build context for'),
    limit: ragBoundedInt(1, 20, 5).describe('Maximum search results to include in context'),
    document_filter: z.array(z.string()).optional().describe('Restrict to specific documents'),
    max_context_length: ragBoundedInt(500, 50000, 8000).describe('Maximum total context length in characters'),
    max_results_per_document: ragBoundedInt(1, 20, 3).describe('Maximum chunks per document for source diversity (default: 3)'),
});
1804
/**
 * Handle ocr_rag_context - Assemble a RAG context block for LLM consumption.
 *
 * Pipeline:
 * 1. Hybrid search: BM25 over chunks, VLM descriptions and extractions, plus
 *    vector search, fused with RRF (k=60, equal weights).
 * 1b. Drop chunks that mostly overlap a higher-ranked chunk of the same document.
 * 1c. Cap the number of chunks per document (max_results_per_document).
 * 2. Render the surviving results as markdown excerpts.
 * 3. Truncate the markdown to max_context_length characters.
 *
 * @param params Raw tool arguments, validated against RagContextInput.
 * @returns Formatted success response carrying `context`, `sources` and the
 *   dedup/diversity statistics, or the handleError() result on failure.
 */
async function handleRagContext(params) {
    try {
        const input = validateInput(RagContextInput, params);
        const { db, vector } = requireDatabase();
        const conn = db.getConnection();
        const limit = input.limit ?? 5;
        const maxContextLength = input.max_context_length ?? 8000;
        const maxPerDoc = input.max_results_per_document ?? 3;
        // ── Step 1: Run hybrid search (BM25 + semantic + RRF) ──────────────────
        const bm25 = new BM25SearchService(conn);
        const fetchLimit = limit * 2;
        // The three BM25 sources take the same options; each call gets its own copy.
        const bm25Options = {
            query: input.question,
            limit: fetchLimit,
            documentFilter: input.document_filter,
            includeHighlight: false,
        };
        const bm25ChunkResults = bm25.search({ ...bm25Options });
        const bm25VlmResults = bm25.searchVLM({ ...bm25Options });
        const bm25ExtractionResults = bm25.searchExtractions({ ...bm25Options });
        // Merge the three BM25 lists, keep the best fetchLimit, and re-rank 1..n.
        const allBm25 = [...bm25ChunkResults, ...bm25VlmResults, ...bm25ExtractionResults]
            .sort((a, b) => b.bm25_score - a.bm25_score)
            .slice(0, fetchLimit)
            .map((r, i) => ({ ...r, rank: i + 1 }));
        // Semantic search
        const embedder = getEmbeddingService();
        const queryVector = await embedder.embedSearchQuery(input.question);
        const semanticResults = vector.searchSimilar(queryVector, {
            limit: fetchLimit,
            threshold: 0.3,
            documentFilter: input.document_filter,
        });
        // Convert to ranked format and fuse with RRF (default weights).
        // Over-fetch (limit * 3) to leave room for dedup + diversity filtering.
        const bm25Ranked = toBm25Ranked(allBm25);
        const semanticRanked = toSemanticRanked(semanticResults);
        const fusion = new RRFFusion({ k: 60, bm25Weight: 1.0, semanticWeight: 1.0 });
        const fusedResults = fusion.fuse(bm25Ranked, semanticRanked, limit * 3);
        // Handle empty results
        if (fusedResults.length === 0) {
            const emptyContext = '## Relevant Document Excerpts\n\nNo relevant documents found for the given question.';
            return formatResponse(successResult({
                question: input.question,
                context: emptyContext,
                context_length: emptyContext.length,
                search_results_used: 0,
                sources: [],
                deduplication: { before: 0, after: 0, removed: 0 },
                source_diversity: { max_per_document: maxPerDoc, before: 0, after: 0 },
                next_steps: [{ tool: 'ocr_search', description: 'Try a broader search query' }],
            }));
        }
        // ── Step 1b: Deduplicate overlapping chunks (Task 3.3) ──────────────
        const deduplicated = deduplicateOverlappingResults(fusedResults);
        const dedupStats = {
            before: fusedResults.length,
            after: deduplicated.length,
            removed: fusedResults.length - deduplicated.length,
        };
        // ── Step 1c: Enforce source diversity (Task 3.4) ────────────────────
        const diversified = enforceSourceDiversity(deduplicated, maxPerDoc);
        const diversityStats = {
            max_per_document: maxPerDoc,
            before: deduplicated.length,
            after: diversified.length,
        };
        // Apply final limit after dedup + diversity
        const finalFused = diversified.slice(0, limit);
        // Enrich VLM results with image metadata
        enrichVLMResultsWithImageMetadata(conn, finalFused);
        // ── Step 2: Assemble markdown context ──────────────────────────────────
        // Render text as a markdown blockquote body. A result without text
        // renders as an empty quote instead of throwing and failing the request.
        const asQuote = (text) => String(text ?? '').replace(/\n/g, '\n> ');
        const contextParts = [];
        // Document excerpts
        contextParts.push('## Relevant Document Excerpts\n');
        const sources = [];
        for (let i = 0; i < finalFused.length; i++) {
            const r = finalFused[i];
            const score = Math.round(r.rrf_score * 1000) / 1000;
            const fileName = r.source_file_name || path.basename(r.source_file_path || 'unknown');
            const pageInfo = r.page_number !== null && r.page_number !== undefined ? `, Page ${r.page_number}` : '';
            contextParts.push(`### Result ${i + 1} (Score: ${score})`);
            contextParts.push(`**Source:** ${fileName}${pageInfo}`);
            if (r.section_path) {
                contextParts.push(`**Section:** ${r.section_path}`);
            }
            if (r.heading_context) {
                contextParts.push(`**Heading:** ${r.heading_context}`);
            }
            // For VLM results with image metadata, include image context
            if (r.image_extracted_path) {
                const blockType = r.image_block_type || 'Image';
                const imgPage = r.image_page_number ?? r.page_number ?? 'unknown';
                contextParts.push(`> **[Image: ${blockType} on page ${imgPage}]**`);
                contextParts.push(`> File: ${r.image_extracted_path}`);
                contextParts.push(`> Description: ${asQuote(r.original_text)}\n`);
            }
            else {
                contextParts.push(`> ${asQuote(r.original_text)}\n`);
            }
            sources.push({
                file_name: fileName,
                page_number: r.page_number,
                document_id: r.document_id,
            });
        }
        // ── Step 3: Truncate to max_context_length ─────────────────────────────
        let assembledMarkdown = contextParts.join('\n');
        if (assembledMarkdown.length > maxContextLength) {
            // Reserve three characters for the ellipsis so the total stays within the limit.
            assembledMarkdown = assembledMarkdown.slice(0, maxContextLength - 3) + '...';
        }
        // ── Step 4: Return structured response ─────────────────────────────────
        const ragResponse = {
            question: input.question,
            context: assembledMarkdown,
            context_length: assembledMarkdown.length,
            search_results_used: finalFused.length,
            sources,
            deduplication: dedupStats,
            source_diversity: diversityStats,
        };
        ragResponse.next_steps = [{ tool: 'ocr_search', description: 'Run a more detailed search with filters' }, { tool: 'ocr_document_get', description: 'Get full details for a source document' }, { tool: 'ocr_chunk_context', description: 'Expand a specific chunk with surrounding text' }];
        return formatResponse(successResult(ragResponse));
    }
    catch (error) {
        return handleError(error);
    }
}
1952
+ // ═══════════════════════════════════════════════════════════════════════════════
1953
+ // BENCHMARK COMPARE HANDLER
1954
+ // ═══════════════════════════════════════════════════════════════════════════════
1955
/**
 * Build the overlap analysis for a benchmark run: which document ids were
 * returned by more than one database.
 *
 * Each database is counted at most once per document, so several hits from
 * the same document inside one database do not register as an overlap.
 *
 * @param dbResults Per-database entries with `database_name` and `document_ids`.
 * @returns `{ overlapping_document_ids, overlap_count, total_unique_documents }`
 *   where `overlapping_document_ids` maps a document id to the database names
 *   that returned it.
 */
function summarizeDocumentOverlap(dbResults) {
    const databasesByDocId = new Map(); // doc_id -> Set of db names
    for (const { database_name, document_ids } of dbResults) {
        for (const docId of document_ids) {
            let names = databasesByDocId.get(docId);
            if (!names) {
                names = new Set();
                databasesByDocId.set(docId, names);
            }
            names.add(database_name);
        }
    }
    const overlapping = Object.fromEntries([...databasesByDocId.entries()]
        .filter(([, names]) => names.size > 1)
        .map(([docId, names]) => [docId, [...names]]));
    return {
        overlapping_document_ids: overlapping,
        overlap_count: Object.keys(overlapping).length,
        total_unique_documents: databasesByDocId.size,
    };
}
/**
 * Handle ocr_benchmark_compare - Compare search results across multiple databases.
 *
 * Runs the same query (BM25 or semantic) against every named database, records
 * per-database scores and document ids, then reports which documents were
 * returned by more than one database. A database that fails to open or search
 * is reported with an `error` field instead of aborting the comparison; if
 * every database fails, the whole call is reported as an error.
 *
 * @param params Raw tool arguments (query, database_names, search_type, limit).
 * @returns Formatted success response, or the handleError() result on failure.
 */
async function handleBenchmarkCompare(params) {
    try {
        const input = validateInput(z.object({
            query: z.string().min(1).max(1000),
            database_names: z.array(z.string().min(1)).min(2),
            search_type: z.enum(['bm25', 'semantic']).default('bm25'),
            limit: z.number().int().min(1).max(50).default(10),
        }), params);
        const storagePath = getDefaultStoragePath();
        const dbResults = [];
        // The query embedding does not depend on the database, so it is computed
        // once and reused. It stays undefined after a failure, so the next
        // database retries just as the per-database version did.
        let queryVector;
        for (const dbName of input.database_names) {
            let tempDb = null;
            try {
                tempDb = DatabaseService.open(dbName, storagePath);
                const conn = tempDb.getConnection();
                let scores;
                let documentIds;
                if (input.search_type === 'bm25') {
                    const bm25 = new BM25SearchService(conn);
                    const results = bm25.search({
                        query: input.query,
                        limit: input.limit,
                        includeHighlight: false,
                    });
                    scores = results.map((r) => r.bm25_score);
                    documentIds = results.map((r) => r.document_id);
                }
                else {
                    const vectorSvc = new VectorService(conn);
                    if (queryVector === undefined) {
                        const embedder = getEmbeddingService();
                        queryVector = await embedder.embedSearchQuery(input.query);
                    }
                    const results = vectorSvc.searchSimilar(queryVector, {
                        limit: input.limit,
                        threshold: 0.3,
                    });
                    scores = results.map((r) => r.similarity_score);
                    documentIds = results.map((r) => r.document_id);
                }
                const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
                dbResults.push({
                    database_name: dbName,
                    result_count: scores.length,
                    top_scores: scores.slice(0, 5),
                    avg_score: Math.round(avgScore * 1000) / 1000,
                    document_ids: documentIds,
                });
            }
            catch (error) {
                dbResults.push({
                    database_name: dbName,
                    result_count: 0,
                    top_scores: [],
                    avg_score: 0,
                    document_ids: [],
                    error: error instanceof Error ? error.message : String(error),
                });
            }
            finally {
                // Always release the temporary handle, including on failure.
                tempDb?.close();
            }
        }
        // FIX-6: If every database had an error, return an error instead of success with 0 results.
        // Test for the presence of `error`, not its truthiness, so a failure with
        // an empty message still counts as a failure.
        const allFailed = dbResults.length > 0 && dbResults.every((r) => r.error !== undefined);
        if (allFailed) {
            const errors = dbResults.map((r) => `${r.database_name}: ${r.error}`).join('; ');
            return handleError(new Error(`All databases failed: ${errors}`));
        }
        return formatResponse(successResult({
            query: input.query,
            search_type: input.search_type,
            limit: input.limit,
            databases: dbResults,
            overlap_analysis: summarizeDocumentOverlap(dbResults),
            next_steps: [{ tool: 'ocr_search', description: 'Search in the current database' }, { tool: 'ocr_db_select', description: 'Switch to a different database' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
2052
+ // ═══════════════════════════════════════════════════════════════════════════════
2053
+ // SEARCH EXPORT HANDLER
2054
+ // ═══════════════════════════════════════════════════════════════════════════════
2055
/**
 * Pick the score to export for one result.
 *
 * The score native to the requested search type wins, so a hybrid export
 * reports the fused `rrf_score` the results were ranked by even when a result
 * also carries a raw `bm25_score`. The other score fields are fallbacks.
 *
 * @param result A search result row.
 * @param searchType 'bm25' | 'semantic' | 'hybrid'.
 * @returns The chosen score, or undefined when the result carries none.
 */
function selectExportScore(result, searchType) {
    const { bm25_score, similarity_score, rrf_score } = result;
    if (searchType === 'hybrid') {
        return rrf_score ?? bm25_score ?? similarity_score;
    }
    if (searchType === 'semantic') {
        return similarity_score ?? bm25_score ?? rrf_score;
    }
    return bm25_score ?? similarity_score ?? rrf_score;
}
/**
 * Handle ocr_search_export - Export search results to CSV or JSON file.
 *
 * Runs the query through handleSearchUnified (search_type 'bm25' maps to mode
 * 'keyword'), parses the JSON tool response, and writes one row per result to
 * `output_path`, creating the parent directory if needed.
 *
 * @param params Raw tool arguments (query, search_type, limit, format,
 *   output_path, include_text).
 * @returns Formatted success response with the written path and result count,
 *   or the handleError() result when the search or the write fails.
 */
async function handleSearchExport(params) {
    try {
        const input = validateInput(z.object({
            query: z.string().min(1).max(1000),
            search_type: z.enum(['bm25', 'semantic', 'hybrid']).default('hybrid'),
            limit: z.number().int().min(1).max(1000).default(100),
            format: z.enum(['csv', 'json']).default('csv'),
            output_path: z.string().min(1),
            include_text: z.boolean().default(true),
        }), params);
        // Run the appropriate search, routing through unified handler with appropriate mode
        const searchParams = {
            query: input.query,
            limit: input.limit,
            include_provenance: false,
            mode: input.search_type === 'bm25' ? 'keyword' : input.search_type,
        };
        const searchResult = await handleSearchUnified(searchParams);
        // Parse search results from the ToolResponse
        if (!searchResult.content || searchResult.content.length === 0) {
            throw new Error('Search returned empty content');
        }
        const responseContent = searchResult.content[0];
        if (responseContent.type !== 'text')
            throw new Error('Unexpected search response format');
        let parsedResponse;
        try {
            parsedResponse = JSON.parse(responseContent.text);
        }
        catch (error) {
            console.error('[search] handleSearchExport failed to parse search response as JSON:', error instanceof Error ? error.message : String(error));
            throw new Error('Failed to parse search response as JSON');
        }
        if (!parsedResponse.success) {
            const errObj = parsedResponse.error;
            throw new Error(`Search failed: ${errObj?.message || 'Unknown error'}`);
        }
        const dataObj = parsedResponse.data;
        const results = Array.isArray(dataObj?.results)
            ? dataObj.results
            : [];
        // Sanitize output path to prevent directory traversal
        const safeOutputPath = sanitizePath(input.output_path);
        // Ensure output directory exists
        const outputDir = path.dirname(safeOutputPath);
        fs.mkdirSync(outputDir, { recursive: true });
        if (input.format === 'json') {
            const exportData = {
                results: results.map((r) => {
                    const row = {
                        document_id: r.document_id,
                        source_file: r.source_file_name || r.source_file_path,
                        page_number: r.page_number,
                        score: selectExportScore(r, input.search_type),
                        result_type: r.result_type,
                    };
                    if (input.include_text)
                        row.text = r.original_text;
                    return row;
                }),
            };
            fs.writeFileSync(safeOutputPath, JSON.stringify(exportData, null, 2));
        }
        else {
            // CSV with RFC 4180-style quoting: every field double-quoted, internal
            // quotes doubled. Rows are joined with "\n" (LF), not CRLF.
            const csvQuote = (value) => `"${value.replace(/"/g, '""')}"`;
            const headers = ['document_id', 'source_file', 'page_number', 'score', 'result_type'];
            if (input.include_text)
                headers.push('text');
            const csvLines = [headers.map(csvQuote).join(',')];
            for (const r of results) {
                const row = [
                    csvQuote(String(r.document_id ?? '')),
                    csvQuote(String(r.source_file_name || r.source_file_path || '')),
                    csvQuote(r.page_number !== null && r.page_number !== undefined ? String(r.page_number) : ''),
                    csvQuote(String(selectExportScore(r, input.search_type) ?? '')),
                    csvQuote(String(r.result_type || '')),
                ];
                if (input.include_text) {
                    row.push(csvQuote(String(r.original_text || '')));
                }
                csvLines.push(row.join(','));
            }
            fs.writeFileSync(safeOutputPath, csvLines.join('\n'));
        }
        return formatResponse(successResult({
            output_path: safeOutputPath,
            format: input.format,
            result_count: results.length,
            search_type: input.search_type,
            query: input.query,
            next_steps: [{ tool: 'ocr_search', description: 'Run another search with different parameters' }, { tool: 'ocr_document_get', description: 'Get details for a document from the results' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
2156
+ // ═══════════════════════════════════════════════════════════════════════════════
2157
+ // SAVED SEARCH HANDLERS
2158
+ // ═══════════════════════════════════════════════════════════════════════════════
2159
/**
 * Input schema for ocr_search_saved. Only `action` is mandatory at schema
 * level; every other field is optional or defaulted here.
 */
const SearchSavedInput = z.object({
    action: z
        .enum(['list', 'get', 'execute', 'save'])
        .describe('Action: list saved searches, get by ID, execute a saved search, or save a new search'),
    saved_search_id: z
        .string()
        .min(1)
        .optional()
        .describe('ID of the saved search (required for get and execute actions)'),
    search_type: z
        .enum(['bm25', 'semantic', 'hybrid'])
        .optional()
        .describe('Filter by search type (list) or search method (save)'),
    // Pagination for the list action
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(50)
        .describe('Max results for list action'),
    offset: z
        .number()
        .int()
        .min(0)
        .default(0)
        .describe('Pagination offset for list action'),
    override_limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .optional()
        .describe('Override the original result limit (execute action only)'),
    // Fields used by the save action
    name: z
        .string()
        .min(1)
        .max(200)
        .optional()
        .describe('Name for saved search (required for save action)'),
    query: z
        .string()
        .min(1)
        .max(1000)
        .optional()
        .describe('Search query (required for save action)'),
    search_params: z
        .record(z.unknown())
        .optional()
        .describe('Search parameters JSON (save action)'),
    result_count: z
        .number()
        .int()
        .min(0)
        .optional()
        .describe('Number of results (save action)'),
    result_ids: z
        .array(z.string())
        .optional()
        .describe('Result IDs array (save action)'),
    notes: z
        .string()
        .optional()
        .describe('Notes about this search (save action)'),
});
2174
+ /**
2175
+ * Handle ocr_search_saved - Unified saved search management (MERGE-B: includes save action)
2176
+ *
2177
+ * Actions:
2178
+ * - save: Save search results for later retrieval
2179
+ * - list: List saved searches with optional type filtering
2180
+ * - get: Retrieve a saved search by ID including all parameters and result IDs
2181
+ * - execute: Re-execute a saved search with current data via handleSearchUnified
2182
+ */
2183
async function handleSearchSaved(params) {
    // Dispatches on input.action:
    //   save    -> inserts a saved_searches row and returns the new id
    //   list    -> paginated listing, optionally filtered by search_type
    //   get     -> one saved search with search_params/result_ids decoded from JSON
    //   execute -> re-runs the stored query through handleSearchUnified and
    //              records the run (last_executed_at / execution_count)
    // Anything thrown inside the try block is handed to handleError and its
    // return value becomes the response.
    try {
        const input = validateInput(SearchSavedInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Follow-up hints shared by the list/get/execute responses.
        const followUps = [
            { tool: 'ocr_search', description: 'Run a new search' },
            { tool: 'ocr_search_saved', description: 'Save a search (action=save) for later' },
        ];
        // Decode a JSON column, reporting corruption as INTERNAL_ERROR for both
        // 'get' and 'execute' instead of leaking a raw SyntaxError.
        const parseStoredJson = (raw, label) => {
            try {
                return JSON.parse(raw);
            }
            catch (parseErr) {
                throw new MCPError('INTERNAL_ERROR', `Failed to parse saved search ${label}: ${String(parseErr)}`);
            }
        };
        if (input.action === 'save') {
            // Validate required fields for save
            if (!input.name)
                throw new MCPError('VALIDATION_ERROR', 'name is required for save action');
            if (!input.query)
                throw new MCPError('VALIDATION_ERROR', 'query is required for save action');
            if (!input.search_type)
                throw new MCPError('VALIDATION_ERROR', 'search_type is required for save action');
            if (input.result_count === undefined)
                throw new MCPError('VALIDATION_ERROR', 'result_count is required for save action');
            const id = uuidv4();
            const now = new Date().toISOString();
            conn.prepare(`
                INSERT INTO saved_searches (id, name, query, search_type, search_params, result_count, result_ids, created_at, notes)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            `).run(id, input.name, input.query, input.search_type, JSON.stringify(input.search_params ?? {}), input.result_count, JSON.stringify(input.result_ids ?? []), now, input.notes ?? null);
            return formatResponse(successResult({
                saved_search_id: id,
                name: input.name,
                query: input.query,
                search_type: input.search_type,
                result_count: input.result_count,
                created_at: now,
                next_steps: [{ tool: 'ocr_search_saved', description: 'List or re-execute saved searches' }],
            }));
        }
        if (input.action === 'list') {
            // Build the optional filter once so the page query and the count
            // query can never disagree about which rows are in scope.
            const whereClause = input.search_type ? ' WHERE search_type = ?' : '';
            const filterArgs = input.search_type ? [input.search_type] : [];
            const rows = conn
                .prepare(`SELECT id, name, query, search_type, result_count, created_at, notes, last_executed_at, execution_count FROM saved_searches${whereClause} ORDER BY created_at DESC LIMIT ? OFFSET ?`)
                .all(...filterArgs, input.limit, input.offset);
            const totalRow = conn
                .prepare(`SELECT COUNT(*) as count FROM saved_searches${whereClause}`)
                .get(...filterArgs);
            return formatResponse(successResult({
                action: 'list',
                saved_searches: rows,
                total: totalRow.count,
                limit: input.limit,
                offset: input.offset,
                next_steps: followUps,
            }));
        }
        // Both 'get' and 'execute' require saved_search_id
        if (!input.saved_search_id) {
            throw new MCPError('VALIDATION_ERROR', 'saved_search_id is required for get and execute actions');
        }
        // Single lookup shared by 'get' and 'execute'; a missing row is reported
        // the same way (VALIDATION_ERROR) for both actions.
        const row = conn.prepare('SELECT * FROM saved_searches WHERE id = ?').get(input.saved_search_id);
        if (!row) {
            throw new MCPError('VALIDATION_ERROR', `Saved search not found: ${input.saved_search_id}`);
        }
        if (input.action === 'get') {
            return formatResponse(successResult({
                action: 'get',
                id: row.id,
                name: row.name,
                query: row.query,
                search_type: row.search_type,
                search_params: parseStoredJson(row.search_params, 'params'),
                result_count: row.result_count,
                result_ids: parseStoredJson(row.result_ids, 'result_ids'),
                created_at: row.created_at,
                notes: row.notes,
                next_steps: followUps,
            }));
        }
        // action === 'execute'
        const searchParams = parseStoredJson(row.search_params, 'params');
        if (searchParams === null || typeof searchParams !== 'object' || Array.isArray(searchParams)) {
            // Properties are assigned onto this value below; anything but a plain
            // JSON object would fail with an opaque TypeError.
            throw new MCPError('INTERNAL_ERROR', 'Failed to parse saved search params: stored value is not a JSON object');
        }
        // Override limit if requested
        if (input.override_limit !== undefined) {
            searchParams.limit = input.override_limit;
        }
        // Ensure query is set in params
        searchParams.query = row.query;
        // Map the stored search_type to the unified handler's mode. A Map is used
        // so inherited Object.prototype keys (e.g. "toString") never match.
        const modeBySearchType = new Map([
            ['bm25', 'keyword'],
            ['semantic', 'semantic'],
            ['hybrid', 'hybrid'],
        ]);
        const mode = modeBySearchType.get(row.search_type);
        if (!mode) {
            throw new MCPError('VALIDATION_ERROR', `Unknown search type: ${row.search_type}`);
        }
        searchParams.mode = mode;
        const searchResult = await handleSearchUnified(searchParams);
        // NOTE(review): assumes a failed unified search is flagged with the MCP
        // tool-result `isError` field - confirm against handleError. When it is
        // set, return the failure unchanged rather than wrapping it in a success
        // envelope and counting it as an execution.
        if (searchResult?.isError === true) {
            return searchResult;
        }
        // Parse the search result to wrap with saved search metadata
        const searchResultData = JSON.parse(searchResult.content[0].text);
        // Task 6.4: Update saved search analytics (execution tracking)
        let analyticsWarning;
        try {
            conn.prepare('UPDATE saved_searches SET last_executed_at = ?, execution_count = COALESCE(execution_count, 0) + 1 WHERE id = ?').run(new Date().toISOString(), row.id);
        }
        catch (analyticsErr) {
            // Non-fatal: schema pre-v30 databases may not have these columns yet
            const msg = analyticsErr instanceof Error ? analyticsErr.message : String(analyticsErr);
            console.error('[search] Failed to update saved search analytics:', msg);
            analyticsWarning = `Analytics tracking unavailable: database schema may be pre-v30. ${msg}`;
        }
        const result = {
            action: 'execute',
            saved_search: {
                id: row.id,
                name: row.name,
                query: row.query,
                search_type: row.search_type,
                original_result_count: row.result_count,
                created_at: row.created_at,
                notes: row.notes,
            },
            re_executed_at: new Date().toISOString(),
            search_results: searchResultData,
            next_steps: followUps,
        };
        if (analyticsWarning) {
            result.warning = analyticsWarning;
        }
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
2323
+ // ═══════════════════════════════════════════════════════════════════════════════
2324
+ // CROSS-DATABASE SEARCH HANDLER
2325
+ // ═══════════════════════════════════════════════════════════════════════════════
2326
/**
 * Input schema for the cross-database search handler.
 *
 * - query: non-empty search string
 * - database_names: optional list of database names; when omitted the handler
 *   searches every database it can list
 * - limit_per_db: integer in [1, 50], defaults to 10
 */
const CrossDbSearchInput = z.object({
    query: z
        .string()
        .min(1)
        .describe('Search query'),
    database_names: z
        .array(z.string())
        .optional()
        .describe('Database names to search (default: all databases)'),
    limit_per_db: z
        .number()
        .int()
        .min(1)
        .max(50)
        .default(10)
        .describe('Maximum results per database'),
});
2333
/**
 * Handle ocr_search_cross_db - Search across multiple databases using BM25
 *
 * Opens each listed database read-only, runs an FTS5 MATCH against
 * `chunks_fts`, and merges the hits. Because BM25 scores depend on each
 * database's own corpus statistics, scores are min-max normalized per
 * database before the merged list is sorted.
 *
 * A database that has no `chunks_fts` table, or that fails at any point while
 * being searched, is reported in `databases_skipped` and contributes no
 * results. Requested names that match no known database are reported in
 * `databases_not_found`.
 *
 * @param params - Raw tool arguments, validated against CrossDbSearchInput.
 * @returns The formatted success response, or the result of handleError for
 *   any failure outside the per-database loop.
 */
async function handleCrossDbSearch(params) {
    try {
        const input = validateInput(CrossDbSearchInput, params);
        const { listDatabases } = await import('../services/storage/database/static-operations.js');
        const Database = (await import('better-sqlite3')).default;
        // Get list of databases
        let databases = listDatabases();
        let missingDbNames = [];
        // Filter to requested database_names if provided
        if (input.database_names && input.database_names.length > 0) {
            const requested = new Set(input.database_names);
            databases = databases.filter((db) => requested.has(db.name));
            // Surface names that matched nothing so a typo is not mistaken for
            // "no results".
            const found = new Set(databases.map((db) => db.name));
            missingDbNames = [...requested].filter((name) => !found.has(name));
        }
        const allResults = [];
        const skippedDbs = [];
        for (const dbInfo of databases) {
            let conn = null;
            try {
                conn = new Database(dbInfo.path, { readonly: true });
                // Check if FTS table exists
                const ftsCheck = conn
                    .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_fts'")
                    .get();
                if (!ftsCheck) {
                    skippedDbs.push({ name: dbInfo.name, reason: 'No FTS index (chunks_fts table not found)' });
                    continue;
                }
                // Run BM25 search (sanitize query for FTS5 safety)
                const ftsQuery = sanitizeFTS5Query(input.query);
                const rows = conn
                    .prepare(`SELECT c.id, c.document_id, c.text, c.chunk_index, bm25(chunks_fts) AS bm25_score
               FROM chunks_fts
               JOIN chunks c ON c.rowid = chunks_fts.rowid
               WHERE chunks_fts MATCH ?
               ORDER BY bm25(chunks_fts)
               LIMIT ?`)
                    .all(ftsQuery, input.limit_per_db);
                // Prepare the document lookup once per database (and only when there
                // are hits) instead of once per result row.
                const docStmt = rows.length > 0
                    ? conn.prepare('SELECT file_name, file_path FROM documents WHERE id = ?')
                    : null;
                // Collect into a local list first: if any row fails, the database is
                // reported as skipped and none of its rows leak into the merged output.
                const dbResults = rows.map((row) => {
                    const docInfo = docStmt.get(row.document_id);
                    return {
                        database_name: dbInfo.name,
                        document_id: row.document_id,
                        file_name: docInfo?.file_name ?? null,
                        chunk_id: row.id,
                        chunk_index: row.chunk_index,
                        // Tolerate a NULL text column rather than dropping the whole database.
                        text_preview: (row.text ?? '').substring(0, 300),
                        // The query orders by bm25() ascending (best first); taking the
                        // absolute value yields a score where higher is better.
                        bm25_score: Math.abs(row.bm25_score),
                        normalized_score: 0,
                    };
                });
                // Normalize BM25 scores per-database before merging.
                // BM25 scores from different databases use different corpus statistics (IDF, avgdl)
                // so raw scores are not comparable. Min-max normalize each database's scores to [0, 1].
                const scores = dbResults.map((r) => r.bm25_score);
                const minScore = safeMin(scores) ?? 0;
                const maxScore = safeMax(scores) ?? 0;
                const range = maxScore - minScore;
                for (const r of dbResults) {
                    // A zero range (single hit or all-equal scores) maps to 1.0.
                    r.normalized_score = range > 0
                        ? (r.bm25_score - minScore) / range
                        : 1.0;
                }
                allResults.push(...dbResults);
            }
            catch (dbError) {
                const errMsg = dbError instanceof Error ? dbError.message : String(dbError);
                console.error(`[CrossDbSearch] Failed to search database ${dbInfo.name}: ${errMsg}`);
                skippedDbs.push({ name: dbInfo.name, reason: errMsg });
            }
            finally {
                if (conn) {
                    try {
                        conn.close();
                    }
                    catch (closeErr) {
                        console.error(`[CrossDbSearch] Failed to close connection to ${dbInfo.name}: ${String(closeErr)}`);
                    }
                }
            }
        }
        // Sort by normalized score (higher=better)
        allResults.sort((a, b) => b.normalized_score - a.normalized_score);
        return formatResponse(successResult({
            query: input.query,
            databases_searched: databases.length - skippedDbs.length,
            total_results: allResults.length,
            results: allResults,
            score_normalization: 'per_database_min_max',
            databases_skipped: skippedDbs.length > 0 ? skippedDbs : undefined,
            databases_not_found: missingDbNames.length > 0 ? missingDbNames : undefined,
            next_steps: [{ tool: 'ocr_db_select', description: 'Switch to a specific database for deeper search' }, { tool: 'ocr_search', description: 'Search within the current database with full features' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
2441
+ // ═══════════════════════════════════════════════════════════════════════════════
2442
+ // TOOL DEFINITIONS EXPORT
2443
+ // ═══════════════════════════════════════════════════════════════════════════════
2444
/**
 * Search tools collection for MCP server registration.
 *
 * Each entry maps a tool name to its description, its input schema (a zod
 * shape), and the handler function that serves it.
 */
export const searchTools = {
    // Unified search entry point; schema comes from SearchUnifiedInput.
    ocr_search: {
        description: '[ESSENTIAL] Primary search. mode="keyword" (BM25), "semantic" (vector), or "hybrid" (default, best). Quality-weighted, query-expanded, deduplicated.',
        inputSchema: SearchUnifiedInput.shape,
        handler: handleSearchUnified,
    },
    // FTS5 index status / rebuild.
    ocr_fts_manage: {
        description: '[SETUP] FTS5 index maintenance. action="status" checks health; "rebuild" recreates index. Use when keyword search returns unexpected zero results.',
        inputSchema: {
            action: z.enum(['rebuild', 'status']).describe('Action: rebuild index or check status'),
        },
        handler: handleFTSManage,
    },
    // Export of search results to a CSV or JSON file.
    ocr_search_export: {
        description: '[STATUS] Use to export search results to a CSV or JSON file on disk. Returns file path and result count.',
        inputSchema: {
            query: z.string().min(1).max(1000).describe('Search query'),
            search_type: z.enum(['bm25', 'semantic', 'hybrid']).default('hybrid').describe('Search method to use'),
            limit: z.number().int().min(1).max(1000).default(100).describe('Maximum results'),
            format: z.enum(['csv', 'json']).default('csv').describe('Export file format'),
            output_path: z.string().min(1).describe('File path to save export'),
            include_text: z.boolean().default(true).describe('Include full text in export'),
        },
        handler: handleSearchExport,
    },
    // Same query run against two or more named databases.
    ocr_benchmark_compare: {
        description: '[SEARCH] Use when you have the same documents in separate databases and want to compare search quality. Returns per-database results for the same query.',
        inputSchema: {
            query: z.string().min(1).max(1000).describe('Search query'),
            database_names: z.array(z.string().min(1)).min(2).describe('Database names to compare (minimum 2)'),
            search_type: z.enum(['bm25', 'semantic']).default('bm25').describe('Search method to use'),
            limit: z.number().int().min(1).max(50).default(10).describe('Maximum results per database'),
        },
        handler: handleBenchmarkCompare,
    },
    // Context assembly for question answering.
    ocr_rag_context: {
        description: '[ESSENTIAL] Use when answering a user question about document content. Returns pre-assembled, deduplicated markdown context from hybrid search. Best for RAG workflows.',
        inputSchema: {
            question: z.string().min(1).max(2000).describe('The question to build context for'),
            limit: z.number().int().min(1).max(20).default(5).describe('Maximum search results to include in context'),
            document_filter: z.array(z.string()).optional().describe('Restrict to specific documents'),
            max_context_length: z.number().int().min(500).max(50000).default(8000).describe('Maximum total context length in characters'),
            max_results_per_document: z.number().int().min(1).max(20).default(3).describe('Maximum chunks per document for source diversity (default: 3)'),
        },
        handler: handleRagContext,
    },
    // Saved-search management; schema comes from SearchSavedInput.
    ocr_search_saved: {
        description: '[SEARCH] Manage saved searches. action="save"|"list"|"get"|"execute". Save requires name, query, search_type, result_count.',
        inputSchema: SearchSavedInput.shape,
        handler: handleSearchSaved,
    },
    // BM25 search over multiple databases; schema comes from CrossDbSearchInput.
    ocr_search_cross_db: {
        description: '[SEARCH] Use to search across ALL databases at once using BM25 keyword matching. Returns merged results with database source. No need to switch databases.',
        inputSchema: CrossDbSearchInput.shape,
        handler: handleCrossDbSearch,
    },
};
2528
+ //# sourceMappingURL=search.js.map