ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,626 @@
1
+ /**
2
+ * Embedding Management Tools
3
+ *
4
+ * MCP tools for listing, inspecting, and rebuilding embeddings.
5
+ * Provides visibility into embedding state, source context, and provenance.
6
+ *
7
+ * CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
8
+ * Use console.error() for all logging.
9
+ *
10
+ * @module tools/embeddings
11
+ */
12
+ import { z } from 'zod';
13
+ import { v4 as uuidv4 } from 'uuid';
14
+ import { formatResponse, handleError, fetchProvenanceChain } from './shared.js';
15
+ import { successResult } from '../server/types.js';
16
+ import { requireDatabase } from '../server/state.js';
17
+ import { validateInput } from '../utils/validation.js';
18
+ import { getImage } from '../services/storage/database/image-operations.js';
19
+ import { getEmbeddingService, EmbeddingService } from '../services/embedding/embedder.js';
20
+ import { getEmbeddingClient } from '../services/embedding/nomic.js';
21
+ import { computeHash } from '../utils/hash.js';
22
+ import { ProvenanceType as ProvType } from '../models/provenance.js';
23
+ import { EMBEDDING_MODEL } from '../models/embedding.js';
24
+ import { documentNotFoundError } from '../server/errors.js';
25
+ // ═══════════════════════════════════════════════════════════════════════════════
26
+ // HELPER: determine source type from FK fields
27
+ // ═══════════════════════════════════════════════════════════════════════════════
28
/**
 * Classify an embedding by which of its foreign-key fields are populated.
 *
 * An image reference wins over an extraction reference, and a chunk reference
 * only counts when neither of the other two is set.
 *
 * @param {*} chunkId - Value of the embedding's chunk_id field.
 * @param {*} imageId - Value of the embedding's image_id field.
 * @param {*} extractionId - Value of the embedding's extraction_id field.
 * @returns {'chunk'|'image'|'extraction'|'unknown'} The source type; 'unknown'
 *   when all three values are falsy.
 */
function determineSourceType(chunkId, imageId, extractionId) {
    if (imageId) {
        return 'image';
    }
    if (extractionId) {
        return 'extraction';
    }
    // Neither image nor extraction is set, so only the chunk reference decides.
    return chunkId ? 'chunk' : 'unknown';
}
37
+ // ═══════════════════════════════════════════════════════════════════════════════
38
+ // Tool 4.1: ocr_embedding_list
39
+ // ═══════════════════════════════════════════════════════════════════════════════
40
/**
 * Input schema for the `ocr_embedding_list` tool.
 *
 * All three filters are optional; string filters must be non-empty when given.
 */
const EmbeddingListInput = z.object({
    document_id: z.string().min(1).optional(),
    source_type: z.enum(['chunk', 'image', 'extraction']).optional(),
    model_name: z.string().min(1).optional(),
    // Page size: integer in [1, 100], defaulting to 50.
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(50),
    // Pagination offset: non-negative integer, defaulting to 0.
    offset: z
        .number()
        .int()
        .min(0)
        .default(0),
});
47
/**
 * Handler for the `ocr_embedding_list` tool.
 *
 * Validates the filter/pagination input, fetches one page of embeddings via
 * `db.getEmbeddingsFiltered`, and copies a few fields from each embedding's
 * source record (chunk, image, or extraction) onto the entry when that record
 * can be loaded.
 *
 * @param {*} params - Raw tool arguments; validated against EmbeddingListInput.
 * @returns {Promise<*>} The value produced by `formatResponse` on success, or
 *   by `handleError` when anything inside throws.
 */
async function handleEmbeddingList(params) {
    try {
        const input = validateInput(EmbeddingListInput, params);
        const { db } = requireDatabase();
        const { document_id, source_type, model_name, limit, offset } = input;
        const page = db.getEmbeddingsFiltered({ document_id, source_type, model_name, limit, offset });
        const embeddings = page.embeddings.map((row) => {
            const kind = determineSourceType(row.chunk_id, row.image_id, row.extraction_id);
            const summary = {
                id: row.id,
                document_id: row.document_id,
                source_type: kind,
                chunk_id: row.chunk_id,
                image_id: row.image_id,
                extraction_id: row.extraction_id,
                model_name: row.model_name,
                model_version: row.model_version,
                original_text_length: row.original_text_length,
                // Only the first 200 characters of the embedded text are returned.
                original_text_preview: row.original_text.slice(0, 200),
                page_number: row.page_number,
                page_range: row.page_range,
                gpu_device: row.gpu_device,
                generation_duration_ms: row.generation_duration_ms,
                provenance_id: row.provenance_id,
                created_at: row.created_at,
            };
            // Attach source context; a missing source record is simply skipped.
            switch (kind) {
                case 'chunk': {
                    const chunk = db.getChunk(row.chunk_id);
                    if (chunk) {
                        summary.chunk_heading_context = chunk.heading_context;
                        summary.chunk_section_path = chunk.section_path;
                        summary.chunk_index = chunk.chunk_index;
                    }
                    break;
                }
                case 'image': {
                    const image = getImage(db.getConnection(), row.image_id);
                    if (image) {
                        summary.image_extracted_path = image.extracted_path;
                        summary.image_page_number = image.page_number;
                        summary.image_block_type = image.block_type;
                    }
                    break;
                }
                case 'extraction': {
                    const extraction = db.getExtraction(row.extraction_id);
                    if (extraction) {
                        summary.extraction_schema = extraction.schema_json;
                    }
                    break;
                }
                default:
                    break;
            }
            return summary;
        });
        const payload = {
            embeddings,
            total: page.total,
            limit,
            offset,
            filters_applied: {
                document_id: document_id ?? null,
                source_type: source_type ?? null,
                model_name: model_name ?? null,
            },
            next_steps: [
                { tool: 'ocr_embedding_get', description: 'Inspect a specific embedding' },
                { tool: 'ocr_embedding_stats', description: 'Check overall embedding coverage' },
            ],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
121
+ // ═══════════════════════════════════════════════════════════════════════════════
122
+ // Tool 4.2: ocr_embedding_stats
123
+ // ═══════════════════════════════════════════════════════════════════════════════
124
/**
 * Input schema for the `ocr_embedding_stats` tool.
 */
const EmbeddingStatsInput = z.object({
    // Optional; when given it must be a non-empty string.
    document_id: z
        .string()
        .min(1)
        .optional(),
});
127
/**
 * Handler for the `ocr_embedding_stats` tool.
 *
 * Returns whatever `db.getEmbeddingStats` reports for the optional document,
 * plus suggested follow-up tools that depend on whether any embeddings exist.
 *
 * @param {*} params - Raw tool arguments; validated against EmbeddingStatsInput.
 * @returns {Promise<*>} The value produced by `formatResponse` on success, or
 *   by `handleError` when anything inside throws.
 */
async function handleEmbeddingStats(params) {
    try {
        const { document_id: documentId } = validateInput(EmbeddingStatsInput, params);
        const { db } = requireDatabase();
        const stats = db.getEmbeddingStats(documentId);
        let nextSteps;
        if (stats.total_embeddings === 0) {
            // Nothing embedded yet: point the caller at processing.
            nextSteps = [
                { tool: 'ocr_process_pending', description: 'Run processing to generate embeddings' },
                { tool: 'ocr_document_list', description: 'Check if documents exist to process' },
            ];
        }
        else {
            nextSteps = [
                { tool: 'ocr_embedding_rebuild', description: 'Rebuild embeddings for items with gaps' },
                { tool: 'ocr_embedding_list', description: 'Browse individual embeddings' },
            ];
        }
        const payload = {
            document_id: documentId ?? null,
            ...stats,
            next_steps: nextSteps,
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
150
+ // ═══════════════════════════════════════════════════════════════════════════════
151
+ // Tool 4.3: ocr_embedding_get
152
+ // ═══════════════════════════════════════════════════════════════════════════════
153
/**
 * Input schema for the `ocr_embedding_get` tool.
 */
const EmbeddingGetInput = z.object({
    // Required, non-empty embedding ID.
    embedding_id: z
        .string()
        .min(1),
    // Whether to attach the provenance chain; defaults to false.
    include_provenance: z
        .boolean()
        .default(false),
});
157
/**
 * Handler for the `ocr_embedding_get` tool.
 *
 * Loads one embedding by ID and returns its stored fields together with:
 *  - `source_context`: selected fields of the chunk, image, or extraction it
 *    was generated from (omitted when that record cannot be loaded),
 *  - `document_context`: selected fields of the owning document (omitted when
 *    the document cannot be loaded),
 *  - `provenance_chain`: only when `include_provenance` is true.
 *
 * A missing embedding raises inside the try block and is therefore reported
 * through `handleError` rather than thrown to the caller.
 *
 * @param {*} params - Raw tool arguments; validated against EmbeddingGetInput.
 * @returns {Promise<*>} The value produced by `formatResponse` on success, or
 *   by `handleError` when anything inside throws.
 */
async function handleEmbeddingGet(params) {
    try {
        const input = validateInput(EmbeddingGetInput, params);
        const { db } = requireDatabase();
        const embedding = db.getEmbedding(input.embedding_id);
        if (!embedding) {
            throw new Error(`Embedding not found: ${input.embedding_id}`);
        }
        const kind = determineSourceType(embedding.chunk_id, embedding.image_id, embedding.extraction_id);
        const details = {
            id: embedding.id,
            document_id: embedding.document_id,
            source_type: kind,
            chunk_id: embedding.chunk_id,
            image_id: embedding.image_id,
            extraction_id: embedding.extraction_id,
            original_text: embedding.original_text,
            original_text_length: embedding.original_text_length,
            source_file_path: embedding.source_file_path,
            source_file_name: embedding.source_file_name,
            source_file_hash: embedding.source_file_hash,
            page_number: embedding.page_number,
            page_range: embedding.page_range,
            character_start: embedding.character_start,
            character_end: embedding.character_end,
            chunk_index: embedding.chunk_index,
            total_chunks: embedding.total_chunks,
            model_name: embedding.model_name,
            model_version: embedding.model_version,
            task_type: embedding.task_type,
            inference_mode: embedding.inference_mode,
            gpu_device: embedding.gpu_device,
            content_hash: embedding.content_hash,
            generation_duration_ms: embedding.generation_duration_ms,
            provenance_id: embedding.provenance_id,
            created_at: embedding.created_at,
        };
        // Resolve the source record for this embedding; undefined when the
        // source type is unknown or the record no longer exists.
        const loadSourceContext = () => {
            switch (kind) {
                case 'chunk': {
                    const chunk = db.getChunk(embedding.chunk_id);
                    if (!chunk) {
                        return undefined;
                    }
                    return {
                        type: 'chunk',
                        chunk_index: chunk.chunk_index,
                        heading_context: chunk.heading_context,
                        section_path: chunk.section_path,
                        content_types: chunk.content_types,
                        embedding_status: chunk.embedding_status,
                        page_number: chunk.page_number,
                    };
                }
                case 'image': {
                    const image = getImage(db.getConnection(), embedding.image_id);
                    if (!image) {
                        return undefined;
                    }
                    return {
                        type: 'image',
                        extracted_path: image.extracted_path,
                        page_number: image.page_number,
                        block_type: image.block_type,
                        format: image.format,
                        dimensions: image.dimensions,
                        vlm_status: image.vlm_status,
                        vlm_confidence: image.vlm_confidence,
                    };
                }
                case 'extraction': {
                    const extraction = db.getExtraction(embedding.extraction_id);
                    if (!extraction) {
                        return undefined;
                    }
                    return {
                        type: 'extraction',
                        schema_json: extraction.schema_json,
                        content_hash: extraction.content_hash,
                        created_at: extraction.created_at,
                    };
                }
                default:
                    return undefined;
            }
        };
        const sourceContext = loadSourceContext();
        if (sourceContext) {
            details.source_context = sourceContext;
        }
        const owner = db.getDocument(embedding.document_id);
        if (owner) {
            details.document_context = {
                file_path: owner.file_path,
                file_name: owner.file_name,
                file_type: owner.file_type,
                status: owner.status,
            };
        }
        if (input.include_provenance) {
            details.provenance_chain = fetchProvenanceChain(db, embedding.provenance_id, '[embedding_get]');
        }
        details.next_steps = [
            { tool: 'ocr_chunk_context', description: 'View the source chunk with surrounding text' },
            { tool: 'ocr_embedding_rebuild', description: 'Regenerate this embedding' },
        ];
        return formatResponse(successResult(details));
    }
    catch (error) {
        return handleError(error);
    }
}
257
+ // ═══════════════════════════════════════════════════════════════════════════════
258
+ // Tool 4.4: ocr_embedding_rebuild
259
+ // ═══════════════════════════════════════════════════════════════════════════════
260
/**
 * Input schema for the `ocr_embedding_rebuild` tool.
 *
 * The three target IDs are each optional at the schema level; the handler is
 * what enforces that exactly one of them is supplied.
 */
const EmbeddingRebuildInput = z.object({
    document_id: z.string().min(1).optional(),
    chunk_id: z.string().min(1).optional(),
    image_id: z.string().min(1).optional(),
    include_vlm: z
        .boolean()
        .default(false)
        .optional(),
});
266
/**
 * Embed a VLM description for storage and return the resulting vector.
 *
 * Uses the embedding client's `embedChunks` call with a batch size of 1; per
 * the original inline note in this module that call applies the
 * search_document prefix, which is the task type recorded on the rows written
 * by the callers.
 *
 * @param {string} description - VLM description text to embed.
 * @returns {Promise<*>} The first (and only) vector returned by the client.
 * @throws {Error} When the client returns no vector.
 */
async function embedVlmDescriptionForStorage(description) {
    const [embedding] = await getEmbeddingClient().embedChunks([description], 1);
    if (!embedding) {
        throw new Error('Embedding client returned no vector for VLM description');
    }
    return embedding;
}
/**
 * Rebuild the embedding of a single chunk.
 *
 * @param {*} db - Database service from `requireDatabase()`.
 * @param {*} vector - Vector service from `requireDatabase()`.
 * @param {*} embeddingService - Service from `getEmbeddingService()`.
 * @param {string} chunkId - ID of the chunk to re-embed.
 * @returns {Promise<{embeddingIds: Array, provenanceIds: Array}>} IDs reported
 *   by `embedDocumentChunks`.
 * @throws {Error} When the chunk or its document does not exist.
 */
async function rebuildChunkEmbedding(db, vector, embeddingService, chunkId) {
    const chunk = db.getChunk(chunkId);
    if (!chunk) {
        throw new Error(`Chunk not found: ${chunkId}`);
    }
    const doc = db.getDocument(chunk.document_id);
    if (!doc) {
        throw new Error(`Document not found for chunk: ${chunk.document_id}`);
    }
    // Remove the previous embedding row and its vector, if any.
    const oldEmbedding = db.getEmbeddingByChunkId(chunkId);
    if (oldEmbedding) {
        vector.deleteVector(oldEmbedding.id);
        db.deleteEmbeddingsByChunkId(chunkId);
    }
    db.updateChunkEmbeddingStatus(chunkId, 'pending');
    const result = await embeddingService.embedDocumentChunks(db, vector, [chunk], {
        documentId: chunk.document_id,
        filePath: doc.file_path,
        fileName: doc.file_name,
        fileHash: doc.file_hash,
        documentProvenanceId: doc.provenance_id,
    });
    return { embeddingIds: result.embeddingIds, provenanceIds: result.provenanceIds };
}
/**
 * Rebuild the VLM-description embedding of a single image.
 *
 * The new vector is generated BEFORE the old embedding is removed and before
 * any provenance row is written, so a failing embedding call leaves the
 * existing data untouched instead of deleting it and orphaning a provenance row.
 *
 * @param {*} db - Database service from `requireDatabase()`.
 * @param {*} vector - Vector service from `requireDatabase()`.
 * @param {string} imageId - ID of the image whose VLM description is re-embedded.
 * @returns {Promise<{embeddingIds: string[], provenanceIds: string[]}>} The
 *   single new embedding ID and provenance ID.
 * @throws {Error} When the image, its VLM description, or its document is
 *   missing, or when no vector is produced.
 */
async function rebuildImageVlmEmbedding(db, vector, imageId) {
    const conn = db.getConnection();
    const img = getImage(conn, imageId);
    if (!img) {
        throw new Error(`Image not found: ${imageId}`);
    }
    if (!img.vlm_description) {
        throw new Error(`Image ${imageId} has no VLM description to embed`);
    }
    const doc = db.getDocument(img.document_id);
    if (!doc) {
        throw new Error(`Document not found for image: ${img.document_id}`);
    }
    // Fallible async work first; destructive changes only after it succeeded.
    const embVector = await embedVlmDescriptionForStorage(img.vlm_description);
    if (img.vlm_embedding_id) {
        vector.deleteVector(img.vlm_embedding_id);
        db.deleteEmbeddingsByImageId(imageId);
        conn.prepare('UPDATE images SET vlm_embedding_id = NULL WHERE id = ?').run(imageId);
    }
    const embeddingId = uuidv4();
    const provenanceId = uuidv4();
    const now = new Date().toISOString();
    const contentHash = computeHash(img.vlm_description);
    db.insertProvenance({
        id: provenanceId,
        type: 'EMBEDDING',
        created_at: now,
        processed_at: now,
        source_file_created_at: null,
        source_file_modified_at: null,
        source_type: 'EMBEDDING',
        source_path: null,
        source_id: img.provenance_id ?? null,
        root_document_id: doc.provenance_id,
        location: {
            page_number: img.page_number,
            image_index: img.image_index,
        },
        content_hash: contentHash,
        input_hash: null,
        file_hash: doc.file_hash,
        processor: EMBEDDING_MODEL.name,
        processor_version: EMBEDDING_MODEL.version,
        processing_params: {
            dimensions: EMBEDDING_MODEL.dimensions,
            task_type: 'search_document',
            inference_mode: 'local',
            source: 'vlm_description_rebuild',
        },
        processing_duration_ms: null,
        processing_quality_score: null,
        parent_id: img.provenance_id ?? null,
        parent_ids: img.provenance_id ? JSON.stringify([img.provenance_id]) : '[]',
        chain_depth: 4,
        chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'IMAGE', 'VLM_DESCRIPTION', 'EMBEDDING']),
    });
    db.insertEmbedding({
        id: embeddingId,
        chunk_id: null,
        image_id: imageId,
        extraction_id: null,
        document_id: img.document_id,
        original_text: img.vlm_description,
        original_text_length: img.vlm_description.length,
        source_file_path: doc.file_path,
        source_file_name: doc.file_name,
        source_file_hash: doc.file_hash,
        page_number: img.page_number,
        page_range: null,
        character_start: 0,
        character_end: img.vlm_description.length,
        chunk_index: 0,
        total_chunks: 0,
        model_name: EMBEDDING_MODEL.name,
        model_version: EMBEDDING_MODEL.version,
        task_type: 'search_document',
        inference_mode: 'local',
        gpu_device: 'cuda:0',
        provenance_id: provenanceId,
        content_hash: contentHash,
        generation_duration_ms: null,
    });
    vector.storeVector(embeddingId, embVector);
    conn.prepare('UPDATE images SET vlm_embedding_id = ? WHERE id = ?').run(embeddingId, imageId);
    return { embeddingIds: [embeddingId], provenanceIds: [provenanceId] };
}
/**
 * Rebuild every chunk embedding of a document.
 *
 * Only chunk-based embeddings are removed; image and extraction embeddings of
 * the same document are left in place.
 *
 * @param {*} db - Database service from `requireDatabase()`.
 * @param {*} vector - Vector service from `requireDatabase()`.
 * @param {*} embeddingService - Service from `getEmbeddingService()`.
 * @param {string} documentId - ID of the document being rebuilt.
 * @param {*} doc - Document row returned by `db.getDocument(documentId)`.
 * @param {Array} chunks - Non-empty list of the document's chunks.
 * @returns {Promise<{embeddingIds: Array, provenanceIds: Array}>} IDs reported
 *   by `embedDocumentChunks`.
 */
async function rebuildDocumentChunkEmbeddings(db, vector, embeddingService, documentId, doc, chunks) {
    const oldEmbeddings = db.getEmbeddingsByDocumentId(documentId);
    const chunkEmbeddings = oldEmbeddings.filter((e) => e.chunk_id && !e.image_id && !e.extraction_id);
    for (const emb of chunkEmbeddings) {
        vector.deleteVector(emb.id);
    }
    db.getConnection()
        .prepare('DELETE FROM embeddings WHERE document_id = ? AND chunk_id IS NOT NULL AND image_id IS NULL AND extraction_id IS NULL')
        .run(documentId);
    for (const chunk of chunks) {
        db.updateChunkEmbeddingStatus(chunk.id, 'pending');
    }
    const result = await embeddingService.embedDocumentChunks(db, vector, chunks, {
        documentId,
        filePath: doc.file_path,
        fileName: doc.file_name,
        fileHash: doc.file_hash,
        documentProvenanceId: doc.provenance_id,
    });
    return { embeddingIds: result.embeddingIds, provenanceIds: result.provenanceIds };
}
/**
 * Rebuild the VLM-description embeddings of all images of a document whose VLM
 * status is 'complete' and whose description is neither NULL nor '[SKIPPED]'.
 *
 * Each image is handled best-effort: a failure is logged to stderr and the
 * loop continues. Because the new vector is generated before the old one is
 * deleted, an image whose embedding call fails keeps its previous embedding.
 *
 * @param {*} db - Database service from `requireDatabase()`.
 * @param {*} vector - Vector service from `requireDatabase()`.
 * @param {string} documentId - ID of the document being rebuilt.
 * @param {*} doc - Document row returned by `db.getDocument(documentId)`.
 * @returns {Promise<{embeddingIds: string[], provenanceIds: string[]}>} IDs of
 *   the embeddings and provenance rows that were successfully written.
 */
async function rebuildDocumentVlmEmbeddings(db, vector, documentId, doc) {
    const conn = db.getConnection();
    const embeddingIds = [];
    const provenanceIds = [];
    const vlmImages = conn
        .prepare(`SELECT id, vlm_description, vlm_embedding_id, provenance_id, page_number,
                         extracted_path, format
                  FROM images
                  WHERE document_id = ? AND vlm_status = 'complete'
                    AND vlm_description IS NOT NULL AND vlm_description != '[SKIPPED]'`)
        .all(documentId);
    // Prepared once and reused for every image.
    const deleteEmbeddingStmt = conn.prepare('DELETE FROM embeddings WHERE id = ?');
    const clearImageRefStmt = conn.prepare('UPDATE images SET vlm_embedding_id = NULL WHERE id = ?');
    const setImageRefStmt = conn.prepare('UPDATE images SET vlm_embedding_id = ? WHERE id = ?');
    const findVlmProvenanceStmt = conn.prepare(`SELECT id, parent_ids FROM provenance
                  WHERE root_document_id = ? AND type = 'VLM_DESCRIPTION'
                    AND source_id = ?
                  ORDER BY created_at DESC LIMIT 1`);
    for (const img of vlmImages) {
        try {
            // Embed as a stored document (same call as the single-image path) so
            // the vector matches the 'search_document' task type recorded below.
            const vlmVector = await embedVlmDescriptionForStorage(img.vlm_description);
            if (img.vlm_embedding_id) {
                vector.deleteVector(img.vlm_embedding_id);
                deleteEmbeddingStmt.run(img.vlm_embedding_id);
                clearImageRefStmt.run(img.id);
            }
            const embProvId = uuidv4();
            const now = new Date().toISOString();
            const contentHash = computeHash(img.vlm_description);
            // Prefer the latest VLM_DESCRIPTION provenance for this image as
            // parent; fall back to the image's own provenance when none exists.
            const [vlmProv] = findVlmProvenanceStmt.all(doc.provenance_id, img.provenance_id);
            const vlmProvId = vlmProv ? vlmProv.id : img.provenance_id;
            const parsedParents = vlmProv?.parent_ids ? JSON.parse(vlmProv.parent_ids) : [];
            const existingParents = Array.isArray(parsedParents) ? parsedParents : [];
            const parentIds = [...existingParents, vlmProvId];
            db.insertProvenance({
                id: embProvId,
                type: ProvType.EMBEDDING,
                created_at: now,
                processed_at: now,
                source_file_created_at: null,
                source_file_modified_at: null,
                source_type: 'EMBEDDING',
                source_path: null,
                source_id: vlmProvId,
                root_document_id: doc.provenance_id,
                location: { page_number: img.page_number },
                content_hash: contentHash,
                input_hash: contentHash,
                file_hash: doc.file_hash,
                processor: EMBEDDING_MODEL.name,
                processor_version: EMBEDDING_MODEL.version,
                processing_params: {
                    task_type: 'search_document',
                    inference_mode: 'local',
                    source: 'vlm_description_reembed',
                },
                processing_duration_ms: null,
                processing_quality_score: null,
                parent_id: vlmProvId,
                parent_ids: JSON.stringify(parentIds),
                chain_depth: 4,
                chain_path: JSON.stringify([
                    'DOCUMENT',
                    'OCR_RESULT',
                    'IMAGE',
                    'VLM_DESCRIPTION',
                    'EMBEDDING',
                ]),
            });
            const embId = uuidv4();
            db.insertEmbedding({
                id: embId,
                chunk_id: null,
                image_id: img.id,
                extraction_id: null,
                document_id: doc.id,
                original_text: img.vlm_description,
                original_text_length: img.vlm_description.length,
                source_file_path: img.extracted_path ?? 'unknown',
                source_file_name: img.extracted_path?.split('/').pop() ?? 'vlm_description',
                source_file_hash: 'vlm_generated',
                page_number: img.page_number,
                page_range: null,
                character_start: 0,
                character_end: img.vlm_description.length,
                chunk_index: 0,
                total_chunks: 1,
                model_name: EMBEDDING_MODEL.name,
                model_version: EMBEDDING_MODEL.version,
                task_type: 'search_document',
                inference_mode: 'local',
                gpu_device: 'cuda:0',
                provenance_id: embProvId,
                content_hash: contentHash,
                generation_duration_ms: null,
            });
            vector.storeVector(embId, vlmVector);
            setImageRefStmt.run(embId, img.id);
            embeddingIds.push(embId);
            provenanceIds.push(embProvId);
        }
        catch (vlmError) {
            // Non-fatal: log (stderr only) and continue with remaining images.
            console.error(`[WARN] Failed to re-embed VLM description for image ${img.id}: ${vlmError instanceof Error ? vlmError.message : String(vlmError)}`);
        }
    }
    return { embeddingIds, provenanceIds };
}
/**
 * Handler for the `ocr_embedding_rebuild` tool.
 *
 * Exactly one of `document_id`, `chunk_id`, or `image_id` must be supplied:
 *  - `chunk_id`: re-embeds that chunk.
 *  - `image_id`: re-embeds that image's VLM description.
 *  - `document_id`: re-embeds all chunks of the document and, when
 *    `include_vlm` is true, the VLM descriptions of its images as well.
 *
 * @param {*} params - Raw tool arguments; validated against EmbeddingRebuildInput.
 * @returns {Promise<*>} The value produced by `formatResponse` (rebuilt count,
 *   new embedding IDs, provenance IDs, target, next steps) on success, or by
 *   `handleError` when anything inside throws.
 */
async function handleEmbeddingRebuild(params) {
    try {
        const input = validateInput(EmbeddingRebuildInput, params);
        const { db, vector } = requireDatabase();
        const targets = [input.document_id, input.chunk_id, input.image_id].filter(Boolean);
        if (targets.length === 0) {
            throw new Error('Exactly one of document_id, chunk_id, or image_id must be provided');
        }
        if (targets.length > 1) {
            throw new Error('Exactly one of document_id, chunk_id, or image_id must be provided, got multiple');
        }
        const embeddingService = getEmbeddingService();
        const rebuiltIds = [];
        const provenanceIds = [];
        const collect = (outcome) => {
            rebuiltIds.push(...outcome.embeddingIds);
            provenanceIds.push(...outcome.provenanceIds);
        };
        let target;
        if (input.chunk_id) {
            collect(await rebuildChunkEmbedding(db, vector, embeddingService, input.chunk_id));
            target = { type: 'chunk', id: input.chunk_id };
        }
        else if (input.image_id) {
            collect(await rebuildImageVlmEmbedding(db, vector, input.image_id));
            target = { type: 'image', id: input.image_id };
        }
        else {
            const doc = db.getDocument(input.document_id);
            if (!doc) {
                throw documentNotFoundError(input.document_id);
            }
            const chunks = db.getChunksByDocumentId(input.document_id);
            // A chunk-less document is only acceptable when VLM embeddings were requested.
            if (chunks.length === 0 && !input.include_vlm) {
                throw new Error(`No chunks found for document: ${input.document_id}`);
            }
            if (chunks.length > 0) {
                collect(await rebuildDocumentChunkEmbeddings(db, vector, embeddingService, input.document_id, doc, chunks));
            }
            if (input.include_vlm) {
                collect(await rebuildDocumentVlmEmbeddings(db, vector, input.document_id, doc));
            }
            target = { type: 'document', id: input.document_id };
        }
        return formatResponse(successResult({
            rebuilt_count: rebuiltIds.length,
            new_embedding_ids: rebuiltIds,
            provenance_ids: provenanceIds,
            target,
            next_steps: [{ tool: 'ocr_embedding_stats', description: 'Verify embedding coverage after rebuild' }, { tool: 'ocr_search', description: 'Search using the rebuilt embeddings' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
585
+ // ═══════════════════════════════════════════════════════════════════════════════
586
+ // TOOL DEFINITIONS EXPORT
587
+ // ═══════════════════════════════════════════════════════════════════════════════
588
/** Definition of the `ocr_embedding_list` tool. */
const embeddingListTool = {
    description: '[STATUS] Use to browse embeddings with filtering by document, source type (chunk/image/extraction), and model. Returns embedding metadata with source context.',
    inputSchema: {
        document_id: z.string().min(1).optional().describe('Filter by document ID'),
        source_type: z.enum(['chunk', 'image', 'extraction']).optional().describe('Filter by source type'),
        model_name: z.string().min(1).optional().describe('Filter by model name'),
        limit: z.number().int().min(1).max(100).default(50).describe('Max results (default: 50)'),
        offset: z.number().int().min(0).default(0).describe('Pagination offset'),
    },
    handler: handleEmbeddingList,
};
/** Definition of the `ocr_embedding_stats` tool. */
const embeddingStatsTool = {
    description: '[STATUS] Use to check embedding coverage and performance. Returns total count, breakdown by source type, device stats, and counts of unembedded chunks/images.',
    inputSchema: {
        document_id: z.string().min(1).optional().describe('Scope stats to a specific document'),
    },
    handler: handleEmbeddingStats,
};
/** Definition of the `ocr_embedding_get` tool. */
const embeddingGetTool = {
    description: '[STATUS] Use to inspect a specific embedding by ID. Returns source context (chunk, image, or extraction), document context, model info, and optional provenance chain.',
    inputSchema: {
        embedding_id: z.string().min(1).describe('Embedding ID to retrieve'),
        include_provenance: z.boolean().default(false).describe('Include full provenance chain'),
    },
    handler: handleEmbeddingGet,
};
/** Definition of the `ocr_embedding_rebuild` tool. */
const embeddingRebuildTool = {
    description: '[SETUP] Rebuild embeddings for a document, chunk, or image. Use after config changes or VLM re-analysis. include_vlm=true for VLM image embeddings.',
    inputSchema: {
        document_id: z.string().min(1).optional().describe('Rebuild all chunk embeddings for this document (add include_vlm=true for VLM image embeddings too)'),
        chunk_id: z.string().min(1).optional().describe('Rebuild embedding for this specific chunk'),
        image_id: z.string().min(1).optional().describe('Rebuild VLM embedding for this specific image'),
        include_vlm: z.boolean().default(false).optional().describe('When true with document_id, also rebuild VLM embeddings for images'),
    },
    handler: handleEmbeddingRebuild,
};
/**
 * Embedding tool registry exported by this module: maps each tool name to its
 * description, per-field input schema, and handler function.
 */
export const embeddingTools = {
    ocr_embedding_list: embeddingListTool,
    ocr_embedding_stats: embeddingStatsTool,
    ocr_embedding_get: embeddingGetTool,
    ocr_embedding_rebuild: embeddingRebuildTool,
};
626
+ //# sourceMappingURL=embeddings.js.map