ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,1624 @@
1
+ /**
2
+ * Document Management MCP Tools
3
+ *
4
+ * Extracted from src/index.ts Task 22.
5
+ * Tools: ocr_document_list, ocr_document_get, ocr_document_delete,
6
+ * ocr_document_find_similar
7
+ *
8
+ * CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
9
+ * Use console.error() for all logging.
10
+ *
11
+ * @module tools/documents
12
+ */
13
+ import { z } from 'zod';
14
+ import { existsSync, rmSync, writeFileSync, mkdirSync } from 'fs';
15
+ import { resolve, dirname } from 'path';
16
+ import { v4 as uuidv4 } from 'uuid';
17
+ import { requireDatabase, getDefaultStoragePath } from '../server/state.js';
18
+ import { successResult } from '../server/types.js';
19
+ import { validateInput, sanitizePath, DocumentGetInput, DocumentDeleteInput, } from '../utils/validation.js';
20
+ import { listDocumentsWithCursor, encodeCursor } from '../services/storage/database/document-operations.js';
21
+ import { documentNotFoundError, MCPError } from '../server/errors.js';
22
+ import { formatResponse, handleError, fetchProvenanceChain } from './shared.js';
23
+ import { getComparisonSummariesByDocument } from '../services/storage/database/comparison-operations.js';
24
+ import { getClusterSummariesForDocument } from '../services/storage/database/cluster-operations.js';
25
+ import { getImagesByDocument } from '../services/storage/database/image-operations.js';
26
+ import { extractTableStructures } from '../services/chunking/json-block-analyzer.js';
27
+ // ═══════════════════════════════════════════════════════════════════════════════
28
+ // DOCUMENT LIST INPUT SCHEMA (with cursor support)
29
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Input schema for ocr_document_list.
 *
 * All filters are optional. Paging is either offset-based (`limit` + `offset`)
 * or cursor-based (`cursor`).
 */
const DocumentListInputWithCursor = z.object({
    // Restrict results to documents in one of the listed processing states.
    status_filter: z.enum(['pending', 'processing', 'complete', 'failed']).optional(),
    // Page size: integer in [1, 1000]; 50 when omitted.
    limit: z.number().int().min(1).max(1000).default(50),
    // Rows to skip for offset-based paging: integer >= 0; 0 when omitted.
    offset: z.number().int().min(0).default(0),
    created_after: z.string().datetime().optional()
        .describe('Filter documents created after this ISO 8601 timestamp'),
    created_before: z.string().datetime().optional()
        .describe('Filter documents created before this ISO 8601 timestamp'),
    file_type: z.string().optional()
        .describe('Filter by file type (e.g., "pdf", "docx")'),
    cursor: z.string().optional()
        .describe('Cursor from a previous response for keyset pagination. When provided, offset is ignored.'),
});
43
+ // ═══════════════════════════════════════════════════════════════════════════════
44
+ // DOCUMENT TOOL HANDLERS
45
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Handle ocr_document_list - List documents in the current database.
 *
 * Two pagination modes are supported:
 * - offset-based (default): `limit` / `offset`, ordered by created_at DESC, id DESC;
 * - cursor-based: when `cursor` is provided the page comes from
 *   listDocumentsWithCursor() and `offset` is not used.
 *
 * In both modes `total` is the number of documents matching status_filter,
 * created_after, created_before and file_type, independent of paging.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentListInputWithCursor.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload, or the
 *   handleError(...) payload when validation or any query throws.
 */
export async function handleDocumentList(params) {
    try {
        const input = validateInput(DocumentListInputWithCursor, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Each supplied filter contributes one parameterized WHERE condition.
        const activeFilters = [
            ['status = ?', input.status_filter],
            ['created_at > ?', input.created_after],
            ['created_at < ?', input.created_before],
            ['file_type = ?', input.file_type],
        ].filter(([, value]) => value);
        const queryParams = activeFilters.map(([, value]) => value);
        const whereClause = activeFilters.length > 0
            ? ' WHERE ' + activeFilters.map(([condition]) => condition).join(' AND ')
            : '';
        // Count of all matching documents, ignoring limit/offset/cursor.
        const countMatching = () => conn
            .prepare(`SELECT COUNT(*) as total FROM documents${whereClause}`)
            .get(...queryParams).total;
        // Shared projection from a document row to the list item returned to the caller.
        const toListItems = (docs) => {
            const extrasStmt = conn.prepare('SELECT extras_json FROM ocr_results WHERE document_id = ? LIMIT 1');
            return docs.map((d) => ({
                id: d.id,
                file_name: d.file_name,
                file_path: d.file_path,
                file_size: d.file_size,
                file_type: d.file_type,
                status: d.status,
                page_count: d.page_count,
                doc_title: d.doc_title ?? null,
                doc_author: d.doc_author ?? null,
                doc_subject: d.doc_subject ?? null,
                created_at: d.created_at,
                structural_summary: getStructuralSummary(extrasStmt, d.id),
            }));
        };
        if (input.cursor) {
            // NOTE(review): only status/limit/cursor are forwarded here, while `total`
            // below also honours created_after/created_before/file_type. Confirm whether
            // listDocumentsWithCursor can accept those filters so page and total agree.
            const cursorResult = listDocumentsWithCursor(conn, {
                status: input.status_filter,
                limit: input.limit,
                cursor: input.cursor,
            });
            const total = countMatching();
            return formatResponse(successResult({
                documents: toListItems(cursorResult.documents),
                total,
                limit: input.limit,
                next_cursor: cursorResult.next_cursor,
                next_steps: buildDocumentListNextSteps(total),
            }));
        }
        // Standard offset-based pagination path
        const total = countMatching();
        const rows = conn
            .prepare(`SELECT * FROM documents${whereClause} ORDER BY created_at DESC, id DESC LIMIT ? OFFSET ?`)
            .all(...queryParams, input.limit, input.offset);
        // A full page may have more rows after it, so hand back a cursor built from
        // the last row; a short (or empty) page is the end of the listing.
        let next_cursor = null;
        if (rows.length > 0 && rows.length === input.limit) {
            const lastRow = rows[rows.length - 1];
            next_cursor = encodeCursor(lastRow.created_at, lastRow.id);
        }
        return formatResponse(successResult({
            documents: toListItems(rows),
            total,
            limit: input.limit,
            offset: input.offset,
            next_cursor,
            next_steps: buildDocumentListNextSteps(total),
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
/**
 * Summarise the structural fingerprint stored in a document's extras_json.
 *
 * @param {{ get: Function }} extrasStmt - Statement whose get(documentId) yields a row
 *   with an `extras_json` string (or nothing).
 * @param {string} documentId - Document whose OCR extras are looked up.
 * @returns {object|null} Table/figure/heading counts and content-type distribution, or
 *   null when there is no extras_json, no structural_fingerprint, or reading it throws.
 */
function getStructuralSummary(extrasStmt, documentId) {
    try {
        const row = extrasStmt.get(documentId);
        const rawExtras = row?.extras_json;
        if (!rawExtras) {
            return null;
        }
        const fingerprint = JSON.parse(rawExtras).structural_fingerprint;
        if (!fingerprint) {
            return null;
        }
        // heading_count is the sum over every entry of heading_depths.
        let headingTotal = 0;
        const depths = fingerprint.heading_depths;
        if (depths) {
            for (const perDepth of Object.values(depths)) {
                headingTotal = headingTotal + perDepth;
            }
        }
        const summary = {
            table_count: fingerprint.table_count ?? 0,
            figure_count: fingerprint.figure_count ?? 0,
            heading_count: headingTotal,
            content_types: fingerprint.content_type_distribution ?? null,
        };
        return summary;
    }
    catch (error) {
        // Best effort: a bad row must not break the whole listing.
        console.error(`[documents] Failed to parse structural fingerprint for document ${documentId}: ${error instanceof Error ? error.message : String(error)}`);
        return null;
    }
}
/**
 * Suggest follow-up tools after a document listing.
 *
 * @param {number} total - Number of documents that matched the listing filters.
 * @returns {Array<{tool: string, description: string}>} Ingestion hints when nothing
 *   matched (total === 0), otherwise browsing/search hints.
 */
function buildDocumentListNextSteps(total) {
    if (total === 0) {
        return [
            { tool: 'ocr_ingest_files', description: 'Add documents to the database first' },
            { tool: 'ocr_ingest_directory', description: 'Scan a directory for documents to ingest' },
        ];
    }
    return [
        { tool: 'ocr_document_get', description: 'Get details for a specific document by ID' },
        { tool: 'ocr_search', description: 'Search within the corpus' },
        { tool: 'ocr_document_structure', description: 'View a document outline (headings, tables)' },
    ];
}
/**
 * Handle ocr_document_get - Get detailed information about a specific document.
 *
 * Always returns the document row fields, `ocr_info` (null when there is no OCR
 * result), `document_profile`, `comparisons` and `next_steps`. Optional sections are
 * added on request: `ocr_text` (include_text), `chunks` (include_chunks),
 * `json_blocks` / `extras` (include_blocks) and `provenance_chain`
 * (include_full_provenance). `clusters` is present only when the document belongs
 * to at least one cluster.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentGetInput.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload, or the
 *   handleError(...) payload (e.g. when the document does not exist).
 */
export async function handleDocumentGet(params) {
    try {
        const input = validateInput(DocumentGetInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Always fetch the OCR result for metadata; the extracted text itself is only
        // copied into the response when include_text is set.
        const ocrResult = db.getOCRResultByDocumentId(doc.id);
        const result = {
            id: doc.id,
            file_name: doc.file_name,
            file_path: doc.file_path,
            file_hash: doc.file_hash,
            file_size: doc.file_size,
            file_type: doc.file_type,
            status: doc.status,
            page_count: doc.page_count,
            doc_title: doc.doc_title ?? null,
            doc_author: doc.doc_author ?? null,
            doc_subject: doc.doc_subject ?? null,
            created_at: doc.created_at,
            provenance_id: doc.provenance_id,
            ocr_info: ocrResult
                ? {
                    ocr_result_id: ocrResult.id,
                    datalab_request_id: ocrResult.datalab_request_id,
                    datalab_mode: ocrResult.datalab_mode,
                    parse_quality_score: ocrResult.parse_quality_score,
                    cost_cents: ocrResult.cost_cents,
                    page_count: ocrResult.page_count,
                    text_length: ocrResult.text_length,
                    processing_duration_ms: ocrResult.processing_duration_ms,
                    content_hash: ocrResult.content_hash,
                }
                : null,
        };
        // Surface enrichment data from extras_json (Tasks 4.1, 4.2, 4.4).
        // Best effort: malformed extras_json is logged and the fields are simply omitted.
        if (ocrResult?.extras_json) {
            try {
                const extras = JSON.parse(ocrResult.extras_json);
                if (extras.block_type_stats) {
                    result.block_type_stats = extras.block_type_stats;
                }
                if (extras.link_count !== undefined) {
                    result.link_count = extras.link_count;
                    result.structured_links = extras.structured_links ?? [];
                }
                if (extras.structural_fingerprint) {
                    result.structural_fingerprint = extras.structural_fingerprint;
                }
            }
            catch (parseErr) {
                console.error(`[DocumentGet] Failed to parse extras_json for enrichment fields: ${String(parseErr)}`);
            }
        }
        // Compute document_profile from block_type_stats (no additional DB queries)
        const stats = result.block_type_stats;
        if (stats) {
            // A missing counter is treated as 0; otherwise the sums below become NaN
            // and every complexity comparison silently evaluates to false.
            const tableBlocks = stats.table_blocks ?? 0;
            const figureBlocks = stats.figure_blocks ?? 0;
            const codeBlocks = stats.code_blocks ?? 0;
            const listBlocks = stats.list_blocks ?? 0;
            let contentComplexity;
            if (tableBlocks + figureBlocks + codeBlocks > 5) {
                contentComplexity = 'high';
            }
            else if (tableBlocks + figureBlocks > 0) {
                contentComplexity = 'medium';
            }
            else {
                contentComplexity = 'low';
            }
            result.document_profile = {
                has_tables: tableBlocks > 0,
                has_figures: figureBlocks > 0,
                has_code: codeBlocks > 0,
                has_lists: listBlocks > 0,
                content_complexity: contentComplexity,
                tables_per_page: stats.tables_per_page ?? null,
                figures_per_page: stats.figures_per_page ?? null,
                text_density: stats.text_density ?? null,
            };
        }
        else {
            result.document_profile = null;
        }
        if (input.include_text) {
            result.ocr_text = ocrResult?.extracted_text ?? null;
        }
        if (input.include_chunks) {
            // Chunk text is not returned here, only its length and positional metadata.
            const chunks = db.getChunksByDocumentId(doc.id);
            result.chunks = chunks.map((c) => ({
                id: c.id,
                chunk_index: c.chunk_index,
                text_length: c.text.length,
                page_number: c.page_number,
                character_start: c.character_start,
                character_end: c.character_end,
                embedding_status: c.embedding_status,
                heading_context: c.heading_context ?? null,
                heading_level: c.heading_level ?? null,
                section_path: c.section_path ?? null,
                content_types: c.content_types ?? null,
                is_atomic: c.is_atomic ?? 0,
                chunking_strategy: c.chunking_strategy ?? null,
            }));
        }
        if (input.include_blocks && ocrResult) {
            // Unlike the enrichment block above, a parse failure here is not swallowed:
            // it propagates to the outer catch and is reported through handleError.
            result.json_blocks = ocrResult.json_blocks ? JSON.parse(ocrResult.json_blocks) : null;
            result.extras = ocrResult.extras_json ? JSON.parse(ocrResult.extras_json) : null;
        }
        if (input.include_full_provenance) {
            const chain = db.getProvenanceChain(doc.provenance_id);
            result.provenance_chain = chain.map((p) => ({
                id: p.id,
                type: p.type,
                chain_depth: p.chain_depth,
                processor: p.processor,
                processor_version: p.processor_version,
                content_hash: p.content_hash,
                created_at: p.created_at,
            }));
        }
        // Comparison context: show all comparisons referencing this document
        const comparisons = getComparisonSummariesByDocument(db.getConnection(), doc.id);
        result.comparisons = {
            total: comparisons.length,
            items: comparisons.map((c) => ({
                comparison_id: c.id,
                // Report the other side of the pair, whichever column this document is in.
                compared_with: c.document_id_1 === doc.id ? c.document_id_2 : c.document_id_1,
                similarity_ratio: c.similarity_ratio,
                summary: c.summary,
                created_at: c.created_at,
            })),
        };
        // Cluster memberships: show all clusters this document belongs to
        const clusterMemberships = getClusterSummariesForDocument(db.getConnection(), doc.id);
        if (clusterMemberships.length > 0) {
            result.clusters = clusterMemberships.map((c) => ({
                cluster_id: c.id,
                run_id: c.run_id,
                cluster_index: c.cluster_index,
                label: c.label,
                classification_tag: c.classification_tag,
                coherence_score: c.coherence_score,
            }));
        }
        result.next_steps = [
            { tool: 'ocr_document_page', description: 'Read a specific page of this document' },
            { tool: 'ocr_document_structure', description: 'View document outline and layout' },
            { tool: 'ocr_search', description: 'Search within this document (use document_id filter)' },
            { tool: 'ocr_chunk_list', description: 'List all chunks with section/heading filtering' },
            { tool: 'ocr_form_fill', description: 'Fill form fields using this document' },
            { tool: 'ocr_document_versions', description: 'Find other versions of this document' },
            { tool: 'ocr_document_extras', description: 'View OCR extras (blocks, links, fingerprint)' },
            { tool: 'ocr_document_recommend', description: 'Get cluster-based document recommendations' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
/**
 * Handle ocr_document_delete - Delete a document and all its derived data.
 *
 * Order of operations: count derived rows (for the report), delete vectors, delete
 * the document row, then remove the document's extracted-image directory on disk.
 * The image cleanup is best effort: once the database delete has happened, a
 * filesystem error is logged and reported via `images_directory_cleaned: false`
 * (plus `images_cleanup_error`) instead of turning the whole call into a failure.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentDeleteInput.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload with
 *   deletion counts, or the handleError(...) payload (e.g. document not found).
 */
export async function handleDocumentDelete(params) {
    try {
        const input = validateInput(DocumentDeleteInput, params);
        const { db, vector } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Count items before deletion for reporting
        const chunks = db.getChunksByDocumentId(doc.id);
        const embeddings = db.getEmbeddingsByDocumentId(doc.id);
        const provenance = db.getProvenanceByRootDocument(doc.provenance_id);
        // Delete vectors first
        const vectorsDeleted = vector.deleteVectorsByDocumentId(doc.id);
        // Delete document (cascades to chunks, embeddings, provenance)
        db.deleteDocument(doc.id);
        // Clean up extracted image files on disk. The document is already gone from
        // the database at this point, so a failure here must not be reported as a
        // failed delete.
        let imagesCleanedUp = false;
        let imagesCleanupError = null;
        try {
            const imageDir = resolve(getDefaultStoragePath(), 'images', doc.id);
            if (existsSync(imageDir)) {
                rmSync(imageDir, { recursive: true, force: true });
                imagesCleanedUp = true;
            }
        }
        catch (cleanupError) {
            imagesCleanupError = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
            console.error(`[documents] Failed to remove image directory for deleted document ${doc.id}: ${imagesCleanupError}`);
        }
        const payload = {
            document_id: doc.id,
            deleted: true,
            chunks_deleted: chunks.length,
            embeddings_deleted: embeddings.length,
            vectors_deleted: vectorsDeleted,
            provenance_deleted: provenance.length,
            images_directory_cleaned: imagesCleanedUp,
            next_steps: [{ tool: 'ocr_document_list', description: 'Browse remaining documents' }],
        };
        if (imagesCleanupError !== null) {
            // Only present when cleanup failed, so the success shape is unchanged.
            payload.images_cleanup_error = imagesCleanupError;
        }
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
402
+ // ═══════════════════════════════════════════════════════════════════════════════
403
+ // INPUT SCHEMAS FOR NEW TOOLS
404
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Input schema for ocr_document_structure.
 * `format` selects the output shape; the two include_* flags only matter for the
 * tree/outline formats (see their descriptions).
 */
const DocumentStructureInput = z.object({
    document_id: z.string().min(1).describe('Document ID'),
    format: z.enum(['structure', 'tree', 'outline']).default('structure')
        .describe('Output format: "structure" (headings/tables/figures/code), "tree" (hierarchical section tree with chunks), "outline" (flat numbered section list)'),
    include_chunk_ids: z.boolean().default(true)
        .describe('Include chunk IDs in each section node (tree/outline formats only)'),
    include_page_numbers: z.boolean().default(true)
        .describe('Include page numbers in each section node (tree/outline formats only)'),
});
/**
 * Input schema for ocr_document_find_similar.
 */
const FindSimilarInput = z.object({
    document_id: z.string().min(1).describe('Source document ID'),
    // Maximum number of similar documents returned: integer in [1, 50]; 10 when omitted.
    limit: z.number().int().min(1).max(50).default(10),
    min_similarity: z.number().min(0).max(1).default(0.5)
        .describe('Minimum similarity threshold (0-1)'),
});
/**
 * Input schema for ocr_document_update_metadata.
 * The schema itself allows all three metadata fields to be absent; the handler
 * rejects that case separately.
 */
const UpdateMetadataInput = z.object({
    document_ids: z.array(z.string().min(1)).min(1)
        .describe('Document IDs to update'),
    doc_title: z.string().optional(),
    doc_author: z.string().optional(),
    doc_subject: z.string().optional(),
});
/**
 * Input schema for ocr_document_duplicates.
 */
const DuplicateDetectionInput = z.object({
    mode: z.enum(['exact', 'near']).default('near')
        .describe('exact: same file_hash; near: high text similarity'),
    similarity_threshold: z.number().min(0.5).max(1).default(0.9)
        .describe('Minimum similarity for near-duplicate detection'),
    // Maximum number of groups (exact) or pairs (near) returned: integer in [1, 100]; 20 when omitted.
    limit: z.number().int().min(1).max(100).default(20),
});
434
+ // ═══════════════════════════════════════════════════════════════════════════════
435
+ // CROSS-DOCUMENT SIMILARITY HANDLER
436
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Handle ocr_document_find_similar - Find documents similar to a given document
 * using averaged chunk embeddings as document centroid for vector search.
 *
 * Steps: load the source document's chunk-embedding vectors, average them into one
 * centroid, run a vector search with that centroid, group the hits by document
 * (excluding the source), rank by average similarity and enrich each result with
 * document metadata and, when available, its structural fingerprint.
 *
 * @param {object} params - Raw tool arguments; validated against FindSimilarInput.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload, or the
 *   handleError(...) payload (document missing, no embeddings, no vectors, ...).
 */
export async function handleFindSimilar(params) {
    try {
        const input = validateInput(FindSimilarInput, params);
        const { db, vector } = requireDatabase();
        // Verify document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Get all chunk embeddings for source document
        const embeddingRows = db.getConnection()
            .prepare('SELECT id FROM embeddings WHERE document_id = ? AND chunk_id IS NOT NULL')
            .all(input.document_id);
        if (embeddingRows.length === 0) {
            throw new MCPError('VALIDATION_ERROR', `Document "${input.document_id}" has no chunk embeddings. Process the document first.`);
        }
        // Collect vectors and compute centroid; embeddings without a stored vector are skipped.
        const vectors = [];
        for (const row of embeddingRows) {
            const vec = vector.getVector(row.id);
            if (vec) {
                vectors.push(vec);
            }
        }
        if (vectors.length === 0) {
            throw new MCPError('VALIDATION_ERROR', `Document "${input.document_id}" has embedding records but no vectors in vec_embeddings.`);
        }
        // Average vectors to create 768-dim document centroid
        const dims = 768;
        const centroid = new Float32Array(dims);
        for (const vec of vectors) {
            for (let i = 0; i < dims; i++) {
                centroid[i] += vec[i];
            }
        }
        for (let i = 0; i < dims; i++) {
            centroid[i] /= vectors.length;
        }
        // Search for similar embeddings. Hits are per chunk, so fetch 10x the requested
        // document count to leave room for several chunks of the same document.
        const resultLimit = input.limit ?? 10;
        const minSim = input.min_similarity ?? 0.5;
        const searchResults = vector.searchSimilar(centroid, {
            limit: resultLimit * 10,
            threshold: minSim,
        });
        // Aggregate by document: average similarity across matching chunks, excluding source doc
        const docSimilarityMap = new Map();
        for (const r of searchResults) {
            if (r.document_id === input.document_id) {
                continue;
            }
            const entry = docSimilarityMap.get(r.document_id);
            if (entry) {
                entry.totalSim += r.similarity_score;
                entry.count += 1;
            }
            else {
                docSimilarityMap.set(r.document_id, { totalSim: r.similarity_score, count: 1 });
            }
        }
        // Rank by average similarity (rounded to 6 decimals), filter by min_similarity, slice to limit
        const ranked = Array.from(docSimilarityMap.entries())
            .map(([docId, { totalSim, count }]) => ({
                document_id: docId,
                avg_similarity: Math.round((totalSim / count) * 1000000) / 1000000,
                matching_chunks: count,
            }))
            .filter((r) => r.avg_similarity >= minSim)
            .sort((a, b) => b.avg_similarity - a.avg_similarity)
            .slice(0, resultLimit);
        // Enrich with document metadata and structural fingerprint
        const conn = db.getConnection();
        // Prepared once on first use and reused for every ranked document. Preparing
        // inside the try keeps a prepare failure a logged, per-document soft failure.
        let extrasStmt = null;
        const loadStructuralFingerprint = (documentId) => {
            try {
                if (extrasStmt === null) {
                    extrasStmt = conn.prepare('SELECT extras_json FROM ocr_results WHERE document_id = ?');
                }
                const ocrRow = extrasStmt.get(documentId);
                if (ocrRow?.extras_json) {
                    const extras = JSON.parse(ocrRow.extras_json);
                    if (extras.structural_fingerprint) {
                        return extras.structural_fingerprint;
                    }
                }
            }
            catch (error) {
                console.error(`[documents] Failed to enrich structural fingerprint for document ${documentId}: ${error instanceof Error ? error.message : String(error)}`);
            }
            return null;
        };
        const similarDocuments = ranked.map((r) => {
            const simDoc = db.getDocument(r.document_id);
            return {
                document_id: r.document_id,
                file_name: simDoc?.file_name ?? null,
                file_type: simDoc?.file_type ?? null,
                status: simDoc?.status ?? null,
                avg_similarity: r.avg_similarity,
                matching_chunks: r.matching_chunks,
                structural_fingerprint: loadStructuralFingerprint(r.document_id),
            };
        });
        return formatResponse(successResult({
            source_document_id: input.document_id,
            source_chunk_count: vectors.length,
            similar_documents: similarDocuments,
            total: similarDocuments.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Get details for a similar document' }, { tool: 'ocr_document_compare', description: 'Compare two similar documents' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
552
+ // ═══════════════════════════════════════════════════════════════════════════════
553
+ // BATCH METADATA UPDATE HANDLER
554
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Handle ocr_document_update_metadata - Batch update metadata for multiple documents.
 *
 * Each document is processed independently: a missing document or a failing update
 * does not stop the batch.
 *
 * Response fields:
 * - updated_count: documents whose metadata was updated;
 * - not_found_ids: ids that were not updated. For backward compatibility this also
 *   contains ids whose update threw;
 * - failed_updates: `{ document_id, error }` for every id whose lookup or update
 *   threw, so real failures can be told apart from missing documents;
 * - total_requested: number of ids supplied.
 *
 * @param {object} params - Raw tool arguments; validated against UpdateMetadataInput.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload, or the
 *   handleError(...) payload (e.g. when no metadata field was supplied).
 */
export async function handleUpdateMetadata(params) {
    try {
        const input = validateInput(UpdateMetadataInput, params);
        // Verify at least one metadata field is provided (before requiring database)
        if (input.doc_title === undefined &&
            input.doc_author === undefined &&
            input.doc_subject === undefined) {
            throw new MCPError('VALIDATION_ERROR', 'At least one metadata field (doc_title, doc_author, doc_subject) must be provided.');
        }
        const { db } = requireDatabase();
        let updatedCount = 0;
        const notFoundIds = [];
        const failedUpdates = [];
        for (const docId of input.document_ids) {
            try {
                const doc = db.getDocument(docId);
                if (!doc) {
                    notFoundIds.push(docId);
                    continue;
                }
                db.updateDocumentMetadata(docId, {
                    docTitle: input.doc_title,
                    docAuthor: input.doc_author,
                    docSubject: input.doc_subject,
                });
                updatedCount++;
            }
            catch (docError) {
                const errMsg = docError instanceof Error ? docError.message : String(docError);
                console.error(`[WARN] Failed to update metadata for document ${docId}: ${errMsg}`);
                // Still listed in not_found_ids so existing consumers see the id as
                // "not updated"; the actual cause is reported in failed_updates.
                notFoundIds.push(docId);
                failedUpdates.push({ document_id: docId, error: errMsg });
            }
        }
        return formatResponse(successResult({
            updated_count: updatedCount,
            not_found_ids: notFoundIds,
            failed_updates: failedUpdates,
            total_requested: input.document_ids.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Verify the updated metadata' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
601
+ // ═══════════════════════════════════════════════════════════════════════════════
602
+ // DUPLICATE DOCUMENT DETECTION HANDLER
603
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Handle ocr_document_duplicates - Detect duplicate documents.
 *
 * - mode 'exact': groups documents sharing the same file_hash (groups of 2+),
 *   largest groups first, at most `limit` groups.
 * - mode 'near': returns stored comparisons whose similarity_ratio is at least
 *   `similarity_threshold`, most similar first, at most `limit` pairs. Only pairs
 *   that already exist in the comparisons table are reported.
 *
 * @param {object} params - Raw tool arguments; validated against DuplicateDetectionInput.
 * @returns {Promise<object>} The formatResponse(successResult(...)) payload, or the
 *   handleError(...) payload when validation or a query throws.
 */
export async function handleDuplicateDetection(params) {
    try {
        const input = validateInput(DuplicateDetectionInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const nextSteps = () => [{ tool: 'ocr_document_compare', description: 'Compare a duplicate pair in detail' }, { tool: 'ocr_document_delete', description: 'Delete a confirmed duplicate' }];
        if (input.mode === 'exact') {
            // Find file_hash values shared by more than one document
            const groups = conn
                .prepare(`
                    SELECT file_hash, COUNT(*) as count
                    FROM documents
                    GROUP BY file_hash
                    HAVING COUNT(*) > 1
                    ORDER BY count DESC
                    LIMIT ?
                `)
                .all(input.limit);
            // Members are fetched per group instead of via GROUP_CONCAT + split(','):
            // file names may contain commas, and two independent GROUP_CONCAT calls do
            // not guarantee that ids and names come back in matching order.
            // `IS ?` (rather than `= ?`) also matches a NULL file_hash group.
            const membersStmt = conn.prepare('SELECT id, file_name FROM documents WHERE file_hash IS ? ORDER BY created_at, id');
            const duplicateGroups = groups.map((g) => {
                const members = membersStmt.all(g.file_hash);
                return {
                    file_hash: g.file_hash,
                    document_ids: members.map((m) => m.id),
                    file_names: members.map((m) => m.file_name),
                    count: g.count,
                };
            });
            return formatResponse(successResult({
                mode: 'exact',
                total_groups: duplicateGroups.length,
                total_duplicate_documents: duplicateGroups.reduce((sum, g) => sum + g.count, 0),
                groups: duplicateGroups,
                next_steps: nextSteps(),
            }));
        }
        // Near-duplicate mode: query comparisons table
        const comparisons = conn
            .prepare(`
                SELECT c.id as comparison_id, c.document_id_1, c.document_id_2,
                c.similarity_ratio, c.summary,
                d1.file_name as file_name_1, d2.file_name as file_name_2
                FROM comparisons c
                JOIN documents d1 ON d1.id = c.document_id_1
                JOIN documents d2 ON d2.id = c.document_id_2
                WHERE c.similarity_ratio >= ?
                ORDER BY c.similarity_ratio DESC
                LIMIT ?
            `)
            .all(input.similarity_threshold, input.limit);
        return formatResponse(successResult({
            mode: 'near',
            similarity_threshold: input.similarity_threshold,
            total_pairs: comparisons.length,
            pairs: comparisons.map((c) => ({
                comparison_id: c.comparison_id,
                document_id_1: c.document_id_1,
                file_name_1: c.file_name_1,
                document_id_2: c.document_id_2,
                file_name_2: c.file_name_2,
                similarity_ratio: c.similarity_ratio,
                summary: c.summary,
            })),
            next_steps: nextSteps(),
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
675
+ // ═══════════════════════════════════════════════════════════════════════════════
676
+ // DOCUMENT STRUCTURE ANALYSIS HANDLER
677
+ // ═══════════════════════════════════════════════════════════════════════════════
/**
 * Derive a heading outline from chunk metadata.
 *
 * Only the first chunk seen for each distinct heading_context contributes an entry;
 * chunks without a heading_context are ignored. Entries keep first-seen order.
 *
 * @param {Array<{heading_context: ?string, heading_level: ?number, page_number: *}>} chunks
 * @returns {Array<{level: number, text: string, page: *}>} One entry per distinct
 *   heading; `level` falls back to 1 when the chunk has no heading_level.
 */
function buildOutlineFromChunks(chunks) {
    // Map preserves insertion order, so it doubles as the dedupe set and the result list.
    const firstByHeading = new Map();
    for (const { heading_context: heading, heading_level: depth, page_number: page } of chunks) {
        if (!heading || firstByHeading.has(heading)) {
            continue;
        }
        firstByHeading.set(heading, {
            level: depth ?? 1,
            text: heading,
            page,
        });
    }
    return [...firstByHeading.values()];
}
/**
 * Walk a block tree from json_blocks, extracting structural elements.
 *
 * Results are appended to the caller-supplied arrays (nothing is returned):
 * - SectionHeader / Title -> outline `{ level, text, page }` (skipped when text is empty);
 *   level defaults to 1 for Title and 2 for SectionHeader;
 * - Table -> tables `{ page, caption }`;
 * - Figure / Picture -> figures `{ page, caption }`;
 * - Code -> codeBlocks `{ page, language }`.
 * `page` is block.page, else block.page_idx, else null. Children are visited
 * depth-first right after their parent.
 *
 * Entries that are not objects (e.g. null in malformed JSON) are skipped rather than
 * throwing, so a single bad entry cannot abort the walk after the output arrays have
 * already been partially filled.
 *
 * @param {Array<object>} blocks - Blocks at the current tree level.
 * @param {Array<object>} outline - Receives heading entries.
 * @param {Array<object>} tables - Receives table entries.
 * @param {Array<object>} figures - Receives figure/picture entries.
 * @param {Array<object>} codeBlocks - Receives code block entries.
 */
function walkBlocks(blocks, outline, tables, figures, codeBlocks) {
    for (const block of blocks) {
        if (block === null || typeof block !== 'object') {
            continue;
        }
        const blockType = block.block_type;
        const page = block.page ?? block.page_idx ?? null;
        if (blockType === 'SectionHeader' || blockType === 'Title') {
            const text = block.text ?? block.html ?? '';
            const level = block.level ?? (blockType === 'Title' ? 1 : 2);
            if (text) {
                outline.push({ level, text, page });
            }
        }
        else if (blockType === 'Table') {
            const caption = block.caption ?? undefined;
            tables.push({ page, caption });
        }
        else if (blockType === 'Figure' || blockType === 'Picture') {
            const caption = block.caption ?? undefined;
            figures.push({ page, caption });
        }
        else if (blockType === 'Code') {
            const language = block.language ?? undefined;
            codeBlocks.push({ page, language });
        }
        // Recursively walk children if present
        if (Array.isArray(block.children)) {
            walkBlocks(block.children, outline, tables, figures, codeBlocks);
        }
    }
}
729
/**
 * Handle ocr_document_structure - Analyze document structure.
 *
 * Supports three formats:
 * - 'structure' (default): headings, tables, figures, code blocks from json_blocks or chunks
 * - 'tree': hierarchical section tree (delegated to handleDocumentSectionsInternal)
 * - 'outline': flat numbered outline (delegated to handleDocumentSectionsInternal)
 *
 * For the 'structure' format the stored json_blocks of the OCR result are preferred;
 * when they are missing, empty, or cannot be processed, the outline is rebuilt from
 * the document's chunks instead and `source` is reported as 'chunks'.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentStructureInput.
 * @returns {Promise<object>} Formatted success response, or the result of handleError()
 *   when validation, lookup, or processing throws.
 */
export async function handleDocumentStructure(params) {
    try {
        const input = validateInput(DocumentStructureInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Delegate to sections logic for tree/outline formats
        if (input.format === 'tree' || input.format === 'outline') {
            return handleDocumentSectionsInternal(db, doc, input);
        }
        // Default 'structure' format: headings, tables, figures, code blocks
        const conn = db.getConnection();
        let outline = [];
        let tables = [];
        let figures = [];
        let codeBlocks = [];
        let source = 'chunks';
        let documentMap = null;
        // Try json_blocks first (richer structure)
        const ocrRow = conn
            .prepare('SELECT json_blocks FROM ocr_results WHERE document_id = ?')
            .get(input.document_id);
        if (ocrRow?.json_blocks) {
            try {
                const parsed = JSON.parse(ocrRow.json_blocks);
                // Handle both formats: array of blocks or {children: [...]} object.
                // `parsed?.` guards against a JSON `null`/primitive payload.
                const blocks = Array.isArray(parsed)
                    ? parsed
                    : (Array.isArray(parsed?.children) ? parsed.children : null);
                if (blocks && blocks.length > 0) {
                    // Walk into scratch arrays so that a failure part-way through the
                    // walk cannot leak partial json_blocks results into the chunk
                    // fallback below (which would otherwise report them as 'chunks').
                    const walked = { outline: [], tables: [], figures: [], codeBlocks: [] };
                    walkBlocks(blocks, walked.outline, walked.tables, walked.figures, walked.codeBlocks);
                    outline = walked.outline;
                    tables = walked.tables;
                    figures = walked.figures;
                    codeBlocks = walked.codeBlocks;
                    source = 'json_blocks';
                    // Build document map with table column details. This is
                    // best-effort: a failure here only drops `document_map`.
                    try {
                        const ocrTextRow = conn
                            .prepare('SELECT extracted_text FROM ocr_results WHERE document_id = ?')
                            .get(input.document_id);
                        if (ocrTextRow?.extracted_text) {
                            // Pass the original parsed object (or wrap array in {children:...})
                            const jsonBlocksRoot = Array.isArray(parsed)
                                ? { children: parsed }
                                : parsed;
                            const tableStructures = extractTableStructures(jsonBlocksRoot, ocrTextRow.extracted_text, [] // pageOffsets not needed for structure extraction
                            );
                            documentMap = {
                                sections: outline.map((o) => ({
                                    heading: o.text,
                                    level: o.level,
                                    page: o.page,
                                })),
                                tables: tableStructures.map((ts) => ({
                                    page: ts.pageNumber,
                                    columns: ts.columnHeaders,
                                    row_count: ts.rowCount,
                                    column_count: ts.columnCount,
                                })),
                                figures: figures.map((f) => ({
                                    page: f.page,
                                    caption: f.caption ?? null,
                                })),
                                code_blocks: codeBlocks.map((cb) => ({
                                    page: cb.page,
                                    language: cb.language ?? null,
                                })),
                            };
                        }
                    }
                    catch (mapErr) {
                        console.error(`[DocumentStructure] Failed to build document_map: ${String(mapErr)}`);
                    }
                }
            }
            catch (parseErr) {
                console.error(`[DocumentStructure] Failed to parse json_blocks for ${input.document_id}: ${String(parseErr)}`);
                // Fall through to chunk-based analysis
            }
        }
        // Fallback to chunks if no json_blocks or parsing failed
        if (source === 'chunks') {
            const chunks = db.getChunksByDocumentId(input.document_id);
            const chunkData = chunks.map((c) => ({
                heading_context: c.heading_context ?? null,
                heading_level: c.heading_level ?? null,
                page_number: c.page_number,
            }));
            // Array-literal spread (not push(...)) so very long outlines cannot
            // hit the engine's argument-count limit.
            outline = [...buildOutlineFromChunks(chunkData)];
        }
        const responseData = {
            document_id: doc.id,
            file_name: doc.file_name,
            page_count: doc.page_count,
            format: 'structure',
            source,
            outline,
            tables: { count: tables.length, items: tables },
            figures: { count: figures.length, items: figures },
            code_blocks: { count: codeBlocks.length, items: codeBlocks },
            total_structural_elements: outline.length + tables.length + figures.length + codeBlocks.length,
            next_steps: [{ tool: 'ocr_document_page', description: 'Read a specific page from the document' }, { tool: 'ocr_search', description: 'Search within the document' }, { tool: 'ocr_document_tables', description: 'Extract table data from the document' }],
        };
        // document_map is only attached when it could be built from json_blocks
        if (documentMap) {
            responseData.document_map = documentMap;
        }
        return formatResponse(successResult(responseData));
    }
    catch (error) {
        return handleError(error);
    }
}
847
/**
 * Render a section tree as a flat list of numbered outline lines.
 *
 * Each node produces one line of the form
 *   "<number>. <name> (pages <page_range>) [<chunk_count> chunks]"
 * where the page part is omitted when the node has no truthy `page_range`.
 * Children follow their parent immediately and are numbered hierarchically
 * ("1", "1.1", "1.2", "2", ...).
 *
 * @param {Array<object>} nodes - Sibling section nodes (name, chunk_count, optional page_range/children).
 * @param {string} [prefix=''] - Number of the parent section; empty for top level.
 * @returns {string[]} Outline lines in depth-first document order.
 */
function flattenToOutline(nodes, prefix = '') {
    const perSection = Array.from(nodes, (section, position) => {
        const ordinal = position + 1;
        const label = prefix ? `${prefix}.${ordinal}` : `${ordinal}`;
        const pages = section.page_range ? ` (pages ${section.page_range})` : '';
        const ownLine = `${label}. ${section.name}${pages} [${section.chunk_count} chunks]`;
        const descendants = section.children;
        if (!descendants || descendants.length === 0) {
            return [ownLine];
        }
        // Parent line first, then the fully rendered subtree
        return [ownLine, ...flattenToOutline(descendants, label)];
    });
    return perSection.flat();
}
864
/**
 * Internal handler for the 'tree' and 'outline' formats of ocr_document_structure.
 * Called by handleDocumentStructure when format='tree' or format='outline'.
 *
 * Builds a section tree from each chunk's `section_path` ("A > B > C"). A chunk is
 * recorded only on the deepest node of its path; chunks without a section_path are
 * recorded on a synthetic root node, reported through `root_chunks`.
 *
 * @param {object} db - Database service providing getChunksByDocumentId().
 * @param {object} doc - Document record (id and file_name are read).
 * @param {object} input - Validated input (document_id, format, include_chunk_ids, include_page_numbers).
 * @returns {Promise<object>} Formatted success response, or the handleError() result on failure.
 */
async function handleDocumentSectionsInternal(db, doc, input) {
    try {
        const chunks = db.getChunksByDocumentId(input.document_id);
        const wantChunkIds = input.include_chunk_ids;
        const wantPageNumbers = input.include_page_numbers;
        /** Create an empty tree node; optional collections exist only when requested */
        const createNode = (name) => ({
            name,
            chunk_count: 0,
            heading_level: null,
            first_chunk_index: null,
            last_chunk_index: null,
            chunk_ids: wantChunkIds ? [] : undefined,
            page_numbers: wantPageNumbers ? [] : undefined,
            children: [],
        });
        /** Widen the node's [first, last] chunk index range to include chunkIndex */
        const widenChunkIndexRange = (node, chunkIndex) => {
            if (chunkIndex == null) {
                return;
            }
            const { first_chunk_index: lowest, last_chunk_index: highest } = node;
            if (lowest === null || chunkIndex < lowest) {
                node.first_chunk_index = chunkIndex;
            }
            if (highest === null || chunkIndex > highest) {
                node.last_chunk_index = chunkIndex;
            }
        };
        /** Count the chunk on the node and record its id / page when requested */
        const recordChunk = (node, chunk) => {
            node.chunk_count += 1;
            widenChunkIndexRange(node, chunk.chunk_index);
            if (wantChunkIds && node.chunk_ids) {
                node.chunk_ids.push(chunk.id);
            }
            const pageIsKnown = chunk.page_number !== null;
            if (wantPageNumbers && node.page_numbers && pageIsKnown
                && !node.page_numbers.includes(chunk.page_number)) {
                node.page_numbers.push(chunk.page_number);
            }
        };
        const root = createNode('(root)');
        let chunksWithSections = 0;
        let chunksWithoutSections = 0;
        for (const chunk of chunks) {
            if (!chunk.section_path) {
                // Chunks without section_path go to root
                chunksWithoutSections += 1;
                recordChunk(root, chunk);
                continue;
            }
            chunksWithSections += 1;
            // Parse section_path: "Heading 1 > Heading 2 > Heading 3"
            const pathParts = chunk.section_path
                .split(' > ')
                .map((segment) => segment.trim())
                .filter((segment) => segment.length > 0);
            // Descend the tree, creating missing nodes along the way
            const deepest = pathParts.reduce((parent, partName) => {
                let match = parent.children.find((candidate) => candidate.name === partName);
                if (!match) {
                    match = createNode(partName);
                    parent.children.push(match);
                }
                return match;
            }, root);
            // Only the deepest (leaf) level receives the chunk; a path that
            // yields no usable parts attaches the chunk nowhere.
            if (pathParts.length > 0) {
                recordChunk(deepest, chunk);
                // heading_level comes from the chunk (first non-null wins)
                if (deepest.heading_level === null && chunk.heading_level != null) {
                    deepest.heading_level = chunk.heading_level;
                }
            }
        }
        // Post-process: derive page_range ("3" or "3-7") from the collected pages
        const assignPageRanges = (node) => {
            const pages = node.page_numbers;
            if (pages && pages.length > 0) {
                pages.sort((a, b) => a - b);
                const firstPage = pages[0];
                const lastPage = pages[pages.length - 1];
                node.page_range = firstPage === lastPage ? String(firstPage) : `${firstPage}-${lastPage}`;
            }
            else {
                node.page_range = null;
            }
            node.children.forEach((child) => assignPageRanges(child));
        };
        if (wantPageNumbers) {
            assignPageRanges(root);
        }
        // Count every node below the synthetic root
        const countSections = (nodes) => nodes.reduce((total, node) => total + countSections(node.children), nodes.length);
        const totalSections = countSections(root.children);
        const identity = {
            document_id: doc.id,
            file_name: doc.file_name,
        };
        const counts = {
            total_chunks: chunks.length,
            chunks_with_sections: chunksWithSections,
            chunks_without_sections: chunksWithoutSections,
            total_sections: totalSections,
        };
        const nextSteps = [{ tool: 'ocr_document_page', description: 'Read a specific page from the document' }, { tool: 'ocr_search', description: 'Search within the document' }, { tool: 'ocr_document_tables', description: 'Extract table data from the document' }];
        if (input.format === 'outline') {
            // Flat numbered outline format
            return formatResponse(successResult({
                ...identity,
                format: 'outline',
                ...counts,
                root_chunks: root.chunk_count,
                outline: flattenToOutline(root.children),
                next_steps: nextSteps,
            }));
        }
        // Default: tree format
        return formatResponse(successResult({
            ...identity,
            format: 'tree',
            ...counts,
            sections: root.children,
            root_chunks: root.chunk_count,
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1012
+ // ═══════════════════════════════════════════════════════════════════════════════
1013
+ // UNIFIED EXPORT INPUT SCHEMA (MERGE-A: ocr_document_export + ocr_corpus_export → ocr_export)
1014
+ // ═══════════════════════════════════════════════════════════════════════════════
1015
// Input schema for the unified ocr_export tool. The presence of `document_id`
// selects single-document export; its absence selects corpus export.
const ExportInput = z.object({
    // Target document; optional because corpus export needs none
    document_id: z
        .string()
        .min(1)
        .optional()
        .describe('Document ID to export. Omit to export entire corpus.'),
    // Which formats are valid depends on the export mode (checked in handleExport)
    format: z
        .enum(['json', 'markdown', 'csv'])
        .default('json')
        .describe('Export format: json/markdown for single doc, json/csv for corpus'),
    // Destination file; required in both modes
    output_path: z
        .string()
        .min(1)
        .describe('Path to save exported file'),
    include_images: z
        .boolean()
        .default(true)
        .describe('Include image data in export'),
    include_extractions: z
        .boolean()
        .default(true)
        .describe('Include structured extractions (single doc only)'),
    include_provenance: z
        .boolean()
        .default(false)
        .describe('Include provenance chain (single doc only)'),
    include_chunks: z
        .boolean()
        .default(false)
        .describe('Include chunk list per document (corpus only)'),
});
1030
+ // ═══════════════════════════════════════════════════════════════════════════════
1031
+ // UNIFIED EXPORT HANDLER (MERGE-A)
1032
+ // ═══════════════════════════════════════════════════════════════════════════════
1033
/**
 * Handle ocr_export - unified export entry point.
 *
 * With `document_id`: exports that single document (json or markdown).
 * Without `document_id`: exports the entire corpus (json or csv).
 * A format that does not fit the selected mode is rejected with a VALIDATION_ERROR.
 *
 * @param {object} params - Raw tool arguments; validated against ExportInput.
 * @returns {Promise<object>} Response of the selected internal exporter, or the
 *   handleError() result when validation fails.
 */
export async function handleExport(params) {
    try {
        const input = validateInput(ExportInput, params);
        const isSingleDocument = Boolean(input.document_id);
        // Reject format/mode combinations that no exporter implements
        if (isSingleDocument && input.format === 'csv') {
            throw new MCPError('VALIDATION_ERROR', 'CSV format only supported for corpus export, not single document. Use json or markdown.');
        }
        if (!isSingleDocument && input.format === 'markdown') {
            throw new MCPError('VALIDATION_ERROR', 'Markdown format only supported for single document export, not corpus. Use json or csv.');
        }
        return isSingleDocument
            ? handleDocumentExportInternal(input)
            : handleCorpusExportInternal(input);
    }
    catch (error) {
        return handleError(error);
    }
}
1060
/**
 * Internal: export all data for a single document to a JSON or Markdown file.
 *
 * Collects the document record, its OCR result, chunks and, depending on the
 * include_* flags, images, extractions and the provenance chain. The file is
 * written to the sanitized output path; the parent directory is created if missing.
 *
 * @param {object} input - Validated ExportInput with a document_id and format 'json' or 'markdown'.
 * @returns {Promise<object>} Formatted success response with the output path and
 *   element counts, or the handleError() result on failure.
 */
async function handleDocumentExportInternal(input) {
    try {
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        const ocrResult = db.getOCRResultByDocumentId(doc.id);
        const chunks = db.getChunksByDocumentId(doc.id);
        // Images (only loaded when requested)
        const images = input.include_images
            ? getImagesByDocument(db.getConnection(), doc.id).map((row) => ({
                id: row.id,
                page_number: row.page_number,
                image_index: row.image_index,
                block_type: row.block_type,
                extracted_path: row.extracted_path,
                width: row.dimensions?.width ?? null,
                height: row.dimensions?.height ?? null,
                vlm_status: row.vlm_status,
                vlm_description: row.vlm_description ?? null,
                vlm_image_type: row.vlm_structured_data?.imageType ?? null,
                created_at: row.created_at,
            }))
            : [];
        // Structured extractions (only loaded when requested)
        const extractions = input.include_extractions
            ? db.getExtractionsByDocument(doc.id).map((row) => ({
                id: row.id,
                schema_json: row.schema_json,
                extraction_json: row.extraction_json,
                content_hash: row.content_hash,
                created_at: row.created_at,
            }))
            : [];
        // Provenance chain stays undefined unless requested
        const provenance = input.include_provenance
            ? fetchProvenanceChain(db, doc.provenance_id, 'DocumentExport')
            : undefined;
        // Resolve the destination and make sure its directory exists
        const safePath = sanitizePath(input.output_path);
        const targetDir = dirname(safePath);
        if (!existsSync(targetDir)) {
            mkdirSync(targetDir, { recursive: true });
        }
        if (input.format === 'json') {
            const ocrSection = ocrResult
                ? {
                    id: ocrResult.id,
                    datalab_mode: ocrResult.datalab_mode,
                    parse_quality_score: ocrResult.parse_quality_score,
                    page_count: ocrResult.page_count,
                    text_length: ocrResult.text_length,
                    extracted_text: ocrResult.extracted_text,
                    cost_cents: ocrResult.cost_cents,
                    processing_duration_ms: ocrResult.processing_duration_ms,
                }
                : null;
            const exportData = {
                document: {
                    id: doc.id,
                    file_name: doc.file_name,
                    file_path: doc.file_path,
                    file_hash: doc.file_hash,
                    file_size: doc.file_size,
                    file_type: doc.file_type,
                    status: doc.status,
                    page_count: doc.page_count,
                    doc_title: doc.doc_title ?? null,
                    doc_author: doc.doc_author ?? null,
                    doc_subject: doc.doc_subject ?? null,
                    created_at: doc.created_at,
                },
                ocr_results: ocrSection,
                chunks: chunks.map((chunk) => ({
                    id: chunk.id,
                    chunk_index: chunk.chunk_index,
                    text: chunk.text,
                    page_number: chunk.page_number,
                    character_start: chunk.character_start,
                    character_end: chunk.character_end,
                    heading_context: chunk.heading_context ?? null,
                    section_path: chunk.section_path ?? null,
                    content_types: chunk.content_types ?? null,
                })),
            };
            // Optional sections are appended only when their flag is set
            if (input.include_images) {
                exportData.images = images;
            }
            if (input.include_extractions) {
                exportData.extractions = extractions;
            }
            if (input.include_provenance && provenance) {
                exportData.provenance = provenance;
            }
            writeFileSync(safePath, JSON.stringify(exportData, null, 2), 'utf-8');
        }
        else {
            // Markdown export: assemble the document line by line
            const md = [
                `# Document Export: ${doc.file_name}`,
                '',
                '## Metadata',
                `- **File:** ${doc.file_path}`,
                `- **Status:** ${doc.status}`,
                `- **Pages:** ${doc.page_count ?? 'N/A'}`,
                `- **Created:** ${doc.created_at}`,
                `- **File Type:** ${doc.file_type}`,
                `- **File Size:** ${doc.file_size} bytes`,
            ];
            if (doc.doc_title) {
                md.push(`- **Title:** ${doc.doc_title}`);
            }
            if (doc.doc_author) {
                md.push(`- **Author:** ${doc.doc_author}`);
            }
            md.push('');
            if (ocrResult) {
                md.push('## OCR Info', `- **Mode:** ${ocrResult.datalab_mode}`, `- **Quality Score:** ${ocrResult.parse_quality_score}`, `- **Text Length:** ${ocrResult.text_length}`, `- **Processing Time:** ${ocrResult.processing_duration_ms}ms`, '');
            }
            if (chunks.length > 0) {
                md.push('## Content', '');
                for (const chunk of chunks) {
                    const pageLabel = chunk.page_number !== null ? ` (Page ${chunk.page_number})` : '';
                    const headingLabel = chunk.heading_context ? ` - ${chunk.heading_context}` : '';
                    md.push(`### Chunk ${chunk.chunk_index}${pageLabel}${headingLabel}`, '', chunk.text, '');
                }
            }
            if (input.include_images && images.length > 0) {
                md.push('## Images', '');
                images.forEach((img, position) => {
                    const pageLabel = img.page_number !== null ? ` (Page ${img.page_number})` : '';
                    md.push(`### Image ${position + 1}${pageLabel}`);
                    md.push(`- **Path:** ${img.extracted_path ?? 'N/A'}`);
                    md.push(`- **Type:** ${img.block_type ?? 'unknown'}`);
                    md.push(`- **Size:** ${img.width ?? '?'}x${img.height ?? '?'}`);
                    if (img.vlm_description) {
                        md.push(`- **Description:** ${img.vlm_description}`);
                    }
                    md.push('');
                });
            }
            if (input.include_extractions && extractions.length > 0) {
                md.push('## Extractions', '');
                extractions.forEach((ext, position) => {
                    md.push(`### Extraction ${position + 1}`, '', '**Schema:**', '```json', String(ext.schema_json), '```', '', '**Data:**', '```json', String(ext.extraction_json), '```', '');
                });
            }
            if (input.include_provenance && provenance && provenance.length > 0) {
                md.push('## Provenance', '', '```json', JSON.stringify(provenance, null, 2), '```', '');
            }
            writeFileSync(safePath, md.join('\n'), 'utf-8');
        }
        return formatResponse(successResult({
            output_path: safePath,
            format: input.format,
            document_id: doc.id,
            stats: {
                chunk_count: chunks.length,
                image_count: images.length,
                extraction_count: extractions.length,
            },
            next_steps: [{ tool: 'ocr_document_list', description: 'Export another document' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1268
+ // ═══════════════════════════════════════════════════════════════════════════════
1269
+ // INTERNAL CORPUS EXPORT HANDLER
1270
+ // ═══════════════════════════════════════════════════════════════════════════════
1271
/**
 * Internal: export metadata and statistics for the entire corpus.
 *
 * JSON format writes an array with one object per document (optionally with its
 * chunks and images); CSV format writes one quoted row per document. The file is
 * written to the sanitized output path; the parent directory is created if missing.
 *
 * @param {object} input - Validated ExportInput without document_id; format 'json' or 'csv'.
 * @returns {Promise<object>} Formatted success response with the output path and
 *   corpus totals, or the handleError() result on failure.
 */
async function handleCorpusExportInternal(input) {
    try {
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Get all documents
        const documents = db.listDocuments();
        // Sanitize output path
        const safePath = sanitizePath(input.output_path);
        // Create output directory if needed
        const dir = dirname(safePath);
        if (!existsSync(dir)) {
            mkdirSync(dir, { recursive: true });
        }
        // The per-document image count query is identical for every document, so it
        // is prepared once (lazily, on first use) instead of once per loop iteration.
        let imageCountStmt = null;
        const countImages = (documentId) => {
            if (imageCountStmt === null) {
                imageCountStmt = conn.prepare('SELECT COUNT(*) as count FROM images WHERE document_id = ?');
            }
            return imageCountStmt.get(documentId)?.count ?? 0;
        };
        let totalChunks = 0;
        let totalImages = 0;
        if (input.format === 'json') {
            // Build JSON export: array of document objects
            const exportDocs = [];
            for (const doc of documents) {
                const chunkRows = db.getChunksByDocumentId(doc.id);
                const chunkCount = chunkRows.length;
                totalChunks += chunkCount;
                const imageCount = countImages(doc.id);
                totalImages += imageCount;
                const docEntry = {
                    id: doc.id,
                    file_path: doc.file_path,
                    file_name: doc.file_name,
                    file_type: doc.file_type,
                    file_size: doc.file_size,
                    status: doc.status,
                    page_count: doc.page_count,
                    doc_title: doc.doc_title ?? null,
                    doc_author: doc.doc_author ?? null,
                    doc_subject: doc.doc_subject ?? null,
                    chunk_count: chunkCount,
                    image_count: imageCount,
                    created_at: doc.created_at,
                };
                if (input.include_chunks) {
                    docEntry.chunks = chunkRows.map((c) => ({
                        id: c.id,
                        chunk_index: c.chunk_index,
                        text: c.text,
                        page_number: c.page_number,
                        heading_context: c.heading_context ?? null,
                        section_path: c.section_path ?? null,
                        content_types: c.content_types ?? null,
                    }));
                }
                if (input.include_images) {
                    const imgRows = getImagesByDocument(conn, doc.id);
                    // Report the number of images actually exported for this document
                    totalImages = totalImages - imageCount + imgRows.length;
                    docEntry.images = imgRows.map((img) => ({
                        id: img.id,
                        page_number: img.page_number,
                        block_type: img.block_type,
                        extracted_path: img.extracted_path,
                        width: img.dimensions?.width ?? null,
                        height: img.dimensions?.height ?? null,
                        vlm_status: img.vlm_status,
                        vlm_description: img.vlm_description ?? null,
                    }));
                }
                exportDocs.push(docEntry);
            }
            writeFileSync(safePath, JSON.stringify(exportDocs, null, 2), 'utf-8');
        }
        else {
            // CSV format: one row per document.
            // Values are coerced to strings first (null/undefined become an empty
            // field) so a single missing column cannot abort the whole export;
            // embedded double quotes are doubled.
            const csvQuote = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;
            const headers = ['id', 'file_path', 'file_name', 'file_type', 'status', 'page_count', 'chunk_count', 'image_count', 'created_at'];
            const csvLines = [headers.map((header) => csvQuote(header)).join(',')];
            for (const doc of documents) {
                const chunkCount = db.getChunksByDocumentId(doc.id).length;
                totalChunks += chunkCount;
                const imageCount = countImages(doc.id);
                totalImages += imageCount;
                csvLines.push([
                    csvQuote(doc.id),
                    csvQuote(doc.file_path),
                    csvQuote(doc.file_name),
                    csvQuote(doc.file_type),
                    csvQuote(doc.status),
                    csvQuote(doc.page_count),
                    csvQuote(chunkCount),
                    csvQuote(imageCount),
                    csvQuote(doc.created_at),
                ].join(','));
            }
            writeFileSync(safePath, csvLines.join('\n'), 'utf-8');
        }
        return formatResponse(successResult({
            output_path: safePath,
            format: input.format,
            document_count: documents.length,
            total_chunks: totalChunks,
            total_images: totalImages,
            next_steps: [{ tool: 'ocr_report_overview', description: 'Get quality and corpus analytics' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1385
+ // ═══════════════════════════════════════════════════════════════════════════════
1386
+ // DOCUMENT VERSIONS HANDLER
1387
+ // ═══════════════════════════════════════════════════════════════════════════════
1388
// Input schema for ocr_document_versions: the one document whose file_path
// is used to look up sibling versions.
const DocumentVersionsInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID to find versions of'),
});
1391
/**
 * Handle ocr_document_versions - list every document row that shares the
 * file_path of the given document, newest first.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentVersionsInput.
 * @returns {Promise<object>} Formatted success response with the version list,
 *   or the handleError() result when validation or lookup fails.
 */
async function handleDocumentVersions(params) {
    try {
        const input = validateInput(DocumentVersionsInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // All rows with the same file_path count as versions of this document
        const versionQuery = conn.prepare(`SELECT id, file_hash, file_size, status, created_at, ocr_completed_at
         FROM documents
         WHERE file_path = ?
         ORDER BY created_at DESC`);
        const versionRows = versionQuery.all(doc.file_path);
        // Project each row onto the response fields
        const versions = versionRows.map(({ id, file_hash, file_size, status, created_at, ocr_completed_at }) => ({
            id,
            file_hash,
            file_size,
            status,
            created_at,
            ocr_completed_at,
        }));
        return formatResponse(successResult({
            document_id: input.document_id,
            file_path: doc.file_path,
            versions,
            total_versions: versionRows.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Get details for a specific version' }, { tool: 'ocr_document_compare', description: 'Compare two versions' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1429
+ // ═══════════════════════════════════════════════════════════════════════════════
1430
+ // DOCUMENT WORKFLOW HANDLER
1431
+ // ═══════════════════════════════════════════════════════════════════════════════
1432
// Workflow states are stored as ordinary tags whose name starts with this prefix
const WORKFLOW_PREFIX = 'workflow:';
// Tag color (hex) used when a workflow tag is created for the given state
const WORKFLOW_COLORS = {
    draft: '#6B7280',
    review: '#F59E0B',
    approved: '#10B981',
    rejected: '#EF4444',
    archived: '#6366F1',
};
1440
// Input schema for ocr_document_workflow. `state` is optional at schema level;
// the handler rejects action='set' without it.
const DocumentWorkflowInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID'),
    action: z
        .enum(['get', 'set', 'history'])
        .describe('Action: get current state, set new state, or view history'),
    state: z
        .enum(['draft', 'review', 'approved', 'rejected', 'archived'])
        .optional()
        .describe('New workflow state (required for action=set)'),
    note: z
        .string()
        .max(500)
        .optional()
        .describe('Optional note for state transition'),
});
1447
/**
 * Look up a document's current workflow state, i.e. the most recently applied
 * tag whose name matches 'workflow:%'.
 *
 * @param {object} conn - Database connection exposing prepare().
 * @param {string} documentId - ID of the document to inspect.
 * @returns {string} The tag name with the workflow prefix removed, or 'none'
 *   when the document has no workflow tag.
 */
function getCurrentWorkflowState(conn, documentId) {
    const latestWorkflowTagQuery = conn.prepare(`SELECT t.name FROM tags t
       JOIN entity_tags et ON et.tag_id = t.id
       WHERE et.entity_type = 'document' AND et.entity_id = ?
         AND t.name LIKE 'workflow:%'
       ORDER BY et.created_at DESC LIMIT 1`);
    const latest = latestWorkflowTagQuery.get(documentId);
    if (!latest) {
        return 'none';
    }
    return latest.name.replace(WORKFLOW_PREFIX, '');
}
1460
/**
 * Handle ocr_document_workflow - manage document workflow states via tags.
 *
 * Actions:
 * - 'get': report the current state
 * - 'set': apply the tag for `state` to the document and report the transition
 * - 'history': list every workflow tag applied to the document, oldest first
 *
 * Earlier workflow tags are never removed; the newest one is the current state.
 *
 * @param {object} params - Raw tool arguments; validated against DocumentWorkflowInput.
 * @returns {Promise<object>} Formatted success response for the requested action,
 *   or the handleError() result on failure.
 */
async function handleDocumentWorkflow(params) {
    try {
        const input = validateInput(DocumentWorkflowInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Verify document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        const documentId = input.document_id;
        // Every action returns the same follow-up suggestions
        const nextSteps = [{ tool: 'ocr_document_get', description: 'View document details after workflow change' }, { tool: 'ocr_tag_search', description: 'Find other documents in the same workflow state' }];
        // ── action: get ──
        if (input.action === 'get') {
            return formatResponse(successResult({
                document_id: documentId,
                current_state: getCurrentWorkflowState(conn, documentId),
                next_steps: nextSteps,
            }));
        }
        // ── action: set ──
        if (input.action === 'set') {
            if (!input.state) {
                throw new MCPError('VALIDATION_ERROR', 'state is required when action is "set"');
            }
            const previousState = getCurrentWorkflowState(conn, documentId);
            // Old workflow tags are kept on purpose: they are the history, and the
            // current state is simply the most recently applied one.
            const tagName = WORKFLOW_PREFIX + input.state;
            const now = new Date().toISOString();
            const noteSuffix = input.note ? ' - ' + input.note : '';
            const tagDescription = `Workflow state: ${input.state}${noteSuffix}`;
            const tagColor = WORKFLOW_COLORS[input.state] ?? '#6B7280';
            // Create the tag if it doesn't exist
            const insertTag = conn.prepare(`INSERT OR IGNORE INTO tags (id, name, description, color, created_at)
           VALUES (?, ?, ?, ?, ?)`);
            insertTag.run(uuidv4(), tagName, tagDescription, tagColor, now);
            // Get the tag ID (may have been pre-existing)
            const tag = conn
                .prepare('SELECT id FROM tags WHERE name = ?')
                .get(tagName);
            // Apply tag to document
            const applyTag = conn.prepare(`INSERT INTO entity_tags (id, entity_type, entity_id, tag_id, created_at)
           VALUES (?, 'document', ?, ?, ?)`);
            applyTag.run(uuidv4(), documentId, tag.id, now);
            return formatResponse(successResult({
                document_id: documentId,
                previous_state: previousState,
                new_state: input.state,
                transitioned_at: now,
                note: input.note ?? null,
                next_steps: nextSteps,
            }));
        }
        // ── action: history ──
        const historyRows = conn
            .prepare(`SELECT t.name, et.created_at
         FROM entity_tags et
         JOIN tags t ON t.id = et.tag_id
         WHERE et.entity_type = 'document' AND et.entity_id = ?
           AND t.name LIKE 'workflow:%'
         ORDER BY et.created_at ASC`)
            .all(documentId);
        const stripPrefix = (tagName) => tagName.replace(WORKFLOW_PREFIX, '');
        // Rows are oldest-first, so the last one is the current state
        let currentState = 'none';
        if (historyRows.length > 0) {
            currentState = stripPrefix(historyRows[historyRows.length - 1].name);
        }
        return formatResponse(successResult({
            document_id: documentId,
            current_state: currentState,
            history: historyRows.map((row) => ({
                state: stripPrefix(row.name),
                applied_at: row.created_at,
            })),
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1539
+ // ═══════════════════════════════════════════════════════════════════════════════
1540
+ // TOOL DEFINITIONS FOR MCP REGISTRATION
1541
+ // ═══════════════════════════════════════════════════════════════════════════════
1542
/**
 * Registry of the document-related tools offered for MCP server registration.
 *
 * Every key is a tool name. Every value groups three things:
 *   - description: text shown to the client describing when to use the tool
 *   - inputSchema: either an inline map of zod validators, or the `.shape`
 *                  of an input definition declared elsewhere in this module
 *   - handler:     the function bound to the tool
 *
 * Key order is kept stable so that any consumer iterating the registry sees
 * the tools in the same sequence.
 */
export const documentTools = {
    // ── Browsing and retrieval ──────────────────────────────────────────────
    ocr_document_list: {
        description: '[ESSENTIAL] Use to browse documents in the current database. Returns metadata with structural summaries. Filter by status, date, or file type. Supports cursor-based pagination for large datasets. Start here after ocr_db_select.',
        inputSchema: {
            status_filter: z.enum(['pending', 'processing', 'complete', 'failed']).optional().describe('Filter by status'),
            limit: z
                .number()
                .int()
                .min(1)
                .max(1000)
                .default(50)
                .describe('Maximum results'),
            offset: z
                .number()
                .int()
                .min(0)
                .default(0)
                .describe('Offset for pagination'),
            created_after: z
                .string()
                .datetime()
                .optional()
                .describe('Filter documents created after this ISO 8601 timestamp'),
            created_before: z
                .string()
                .datetime()
                .optional()
                .describe('Filter documents created before this ISO 8601 timestamp'),
            file_type: z
                .string()
                .optional()
                .describe('Filter by file type (e.g., "pdf", "docx")'),
            cursor: z
                .string()
                .optional()
                .describe('Cursor from previous response for efficient keyset pagination. When provided, offset is ignored. Use next_cursor from the response.'),
        },
        handler: handleDocumentList,
    },

    ocr_document_get: {
        description: '[ESSENTIAL] Use to get full details for a single document. Returns OCR metadata, structure, quality, and memberships. Use ocr_document_page to read specific pages.',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID'),
            // Every include_* flag is a boolean that defaults to false.
            include_text: z
                .boolean()
                .default(false)
                .describe('Include OCR extracted text'),
            include_chunks: z
                .boolean()
                .default(false)
                .describe('Include chunk information'),
            include_blocks: z.boolean().default(false).describe('Include JSON blocks and extras metadata'),
            include_full_provenance: z
                .boolean()
                .default(false)
                .describe('Include full provenance chain'),
        },
        handler: handleDocumentGet,
    },

    // ── Destructive operations ──────────────────────────────────────────────
    ocr_document_delete: {
        description: '[DESTRUCTIVE] Use to permanently delete a document and all derived data (chunks, embeddings, images, provenance). Requires confirm=true.',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID to delete'),
            // The schema accepts only the literal `true` for this field.
            confirm: z.literal(true).describe('Must be true to confirm deletion'),
        },
        handler: handleDocumentDelete,
    },

    // ── Tools whose input schema is declared elsewhere in this module ───────
    ocr_document_find_similar: {
        description: '[ANALYSIS] Use to find documents similar to a given document by content. Returns ranked list with similarity scores. Requires completed embeddings.',
        inputSchema: FindSimilarInput.shape,
        handler: handleFindSimilar,
    },

    ocr_document_structure: {
        description: '[ESSENTIAL] Document structure. format="structure" (default: headings/tables/figures), "tree" (hierarchical with chunk IDs), or "outline" (flat numbered).',
        inputSchema: DocumentStructureInput.shape,
        handler: handleDocumentStructure,
    },

    ocr_document_update_metadata: {
        description: '[MANAGE] Use to update title, author, or subject metadata on one or more documents. Returns updated document IDs.',
        inputSchema: UpdateMetadataInput.shape,
        handler: handleUpdateMetadata,
    },

    ocr_document_duplicates: {
        description: '[ANALYSIS] Use to find duplicate documents. Exact mode matches file hashes; near mode uses similarity scores from comparisons. Returns duplicate pairs.',
        inputSchema: DuplicateDetectionInput.shape,
        handler: handleDuplicateDetection,
    },

    ocr_export: {
        description: '[STATUS] Export document or corpus data. Provide document_id for single doc (json/markdown), omit for corpus (json/csv).',
        inputSchema: ExportInput.shape,
        handler: handleExport,
    },

    ocr_document_versions: {
        description: '[ANALYSIS] Use to find all versions of a re-ingested document. Returns documents sharing the same file path, newest first.',
        inputSchema: DocumentVersionsInput.shape,
        handler: handleDocumentVersions,
    },

    ocr_document_workflow: {
        description: '[MANAGE] Track document review states. action="get"|"set"|"history". States: draft/review/approved/rejected/archived.',
        inputSchema: DocumentWorkflowInput.shape,
        handler: handleDocumentWorkflow,
    },
};
1624
+ //# sourceMappingURL=documents.js.map