ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,735 @@
1
+ /**
2
+ * Document Comparison Tools
3
+ *
4
+ * MCP tools for comparing two OCR-processed documents.
5
+ * Provides text diff and structural diff.
6
+ *
7
+ * CRITICAL: NEVER use console.log() - stdout is JSON-RPC protocol.
8
+ *
9
+ * @module tools/comparison
10
+ */
11
+ import { z } from 'zod';
12
+ import { v4 as uuidv4 } from 'uuid';
13
+ import { formatResponse, handleError, fetchProvenanceChain, } from './shared.js';
14
+ import { successResult } from '../server/types.js';
15
+ import { validateInput } from '../utils/validation.js';
16
+ import { requireDatabase } from '../server/state.js';
17
+ import { computeHash } from '../utils/hash.js';
18
+ import { MCPError } from '../server/errors.js';
19
+ import { compareText, compareStructure, generateSummary, } from '../services/comparison/diff-service.js';
20
+ import { insertComparison, getComparison, listComparisons, } from '../services/storage/database/comparison-operations.js';
21
+ import { getCluster, getClusterDocuments, } from '../services/storage/database/cluster-operations.js';
22
+ import { computeDocumentEmbeddings, cosineSimilarity, } from '../services/clustering/clustering-service.js';
23
+ import { getProvenanceTracker } from '../services/provenance/index.js';
24
+ import { ProvenanceType } from '../models/provenance.js';
25
+ // ═══════════════════════════════════════════════════════════════════════════════
26
+ // INPUT SCHEMAS
27
+ // ═══════════════════════════════════════════════════════════════════════════════
28
/**
 * Input schema for ocr_document_compare.
 *
 * Two required, non-empty document IDs plus optional switches controlling
 * whether text diff operations are produced, how many are returned, and
 * whether the provenance chain is attached to the response.
 */
const DocumentCompareInput = z.object({
    document_id_1: z.string().min(1).describe('First document ID'),
    document_id_2: z.string().min(1).describe('Second document ID'),
    include_text_diff: z.boolean().default(true).describe('Include text-level diff operations'),
    max_diff_operations: z.number().int().min(1).max(10000).default(1000).describe('Maximum diff operations to return'),
    include_provenance: z.boolean().default(false).describe('Include provenance chain for the comparison'),
});
44
/**
 * Input schema for ocr_comparison_list: optional document filter plus
 * limit/offset pagination (limit 1-100, default 50; offset >= 0, default 0).
 */
const ComparisonListInput = z.object({
    document_id: z.string().optional().describe('Filter by document ID (matches either doc1 or doc2)'),
    limit: z.number().int().min(1).max(100).default(50).describe('Maximum results'),
    offset: z.number().int().min(0).default(0).describe('Offset for pagination'),
});
52
/** Input schema for ocr_comparison_get: a single required, non-empty comparison ID. */
const ComparisonGetInput = z.object({
    comparison_id: z.string().min(1).describe('Comparison ID'),
});
55
/**
 * Input schema for ocr_comparison_discover.
 *
 * All fields are optional at the call site: the similarity threshold, the
 * exclude-existing switch and the result limit carry defaults, and
 * document_filter may be omitted entirely.
 */
const ComparisonDiscoverInput = z.object({
    min_similarity: z.number().min(0).max(1).default(0.7).describe('Minimum cosine similarity threshold (0-1)'),
    document_filter: z.array(z.string()).optional().describe('Only consider these document IDs'),
    exclude_existing: z.boolean().default(true).describe('Exclude document pairs that already have comparisons'),
    limit: z.number().int().min(1).max(100).default(20).describe('Maximum pairs to return'),
});
78
/**
 * Input schema for ocr_comparison_batch.
 *
 * Both `pairs` and `cluster_id` are optional here; the schema itself does not
 * require either one.
 */
const ComparisonBatchInput = z.object({
    pairs: z
        .array(z.object({
            doc1: z.string().min(1).describe('First document ID'),
            doc2: z.string().min(1).describe('Second document ID'),
        }))
        .optional()
        .describe('Explicit document pairs to compare'),
    cluster_id: z.string().optional().describe('Compare all documents within this cluster'),
    include_text_diff: z.boolean().default(true).describe('Include text-level diff operations in each comparison'),
});
95
/**
 * Count the rows in `chunks` that belong to a document.
 *
 * @param conn - Database connection exposing prepare(sql).get(...)
 * @param docId - Document ID bound to the query
 * @returns The `cnt` column of the COUNT(*) row
 */
function countChunks(conn, docId) {
    const statement = conn.prepare('SELECT COUNT(*) as cnt FROM chunks WHERE document_id = ?');
    const row = statement.get(docId);
    return row.cnt;
}
98
/**
 * Decode a JSON string read back from a stored comparison row.
 *
 * @param field - Raw JSON text
 * @param fieldName - Column name, used only in the error message
 * @param comparisonId - Comparison ID, used only in the error message
 * @returns The parsed value
 * @throws MCPError (INTERNAL_ERROR) when the text is not valid JSON
 */
function parseStoredJSON(field, fieldName, comparisonId) {
    let decoded;
    try {
        decoded = JSON.parse(field);
    }
    catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        throw new MCPError('INTERNAL_ERROR', `Failed to parse ${fieldName} for comparison '${comparisonId}': stored JSON is malformed. Error: ${reason}`);
    }
    return decoded;
}
110
/**
 * Load a document row and its OCR result, requiring the document to be complete.
 *
 * The OCR row is selected with the same ordering getBlockTypeStats uses
 * (most recent processing_completed_at first), so that when a document has
 * more than one ocr_results row the comparison consistently reads the latest one.
 *
 * @param conn - Database connection exposing prepare(sql).get(...)
 * @param docId - Document ID to load
 * @returns `{ doc, ocr }` - the documents row and the selected ocr_results row
 * @throws MCPError DOCUMENT_NOT_FOUND when no document row exists
 * @throws MCPError VALIDATION_ERROR when the document status is not 'complete'
 * @throws MCPError INTERNAL_ERROR when the document has no OCR result row
 */
function fetchCompleteDocument(conn, docId) {
    const doc = conn.prepare('SELECT * FROM documents WHERE id = ?').get(docId);
    if (!doc) {
        throw new MCPError('DOCUMENT_NOT_FOUND', `Document '${docId}' not found`);
    }
    if (doc.status !== 'complete') {
        throw new MCPError('VALIDATION_ERROR', `Document '${docId}' has status '${String(doc.status)}', expected 'complete'. Run ocr_process_pending first.`);
    }
    const ocr = conn
        .prepare('SELECT * FROM ocr_results WHERE document_id = ? ORDER BY processing_completed_at DESC LIMIT 1')
        .get(docId);
    if (!ocr) {
        throw new MCPError('INTERNAL_ERROR', `No OCR result found for document '${docId}'. Document may need reprocessing.`);
    }
    return { doc, ocr };
}
124
+ // ═══════════════════════════════════════════════════════════════════════════════
125
+ // MULTI-SIGNAL SIMILARITY HELPERS
126
+ // ═══════════════════════════════════════════════════════════════════════════════
127
/**
 * Similarity between the document-level embeddings of two documents.
 *
 * Asks computeDocumentEmbeddings for both documents, picks out each
 * document's entry by document_id, and passes the two embeddings to
 * cosineSimilarity.
 *
 * @param conn - Database connection handed to computeDocumentEmbeddings
 * @param docId1 - First document ID
 * @param docId2 - Second document ID
 * @returns The cosineSimilarity result, or null when either document has no entry
 */
function computeEmbeddingCentroidSimilarity(conn, docId1, docId2) {
    const centroids = computeDocumentEmbeddings(conn, [docId1, docId2]);
    const centroidFor = (id) => centroids.find((entry) => entry.document_id === id);
    const first = centroidFor(docId1);
    const second = centroidFor(docId2);
    if (first && second) {
        return cosineSimilarity(first.embedding, Array.from(second.embedding));
    }
    return null;
}
142
/**
 * Structural similarity of two documents from their block-type counts.
 *
 * Reads each document's block_type_stats via getBlockTypeStats, lines the
 * counts up over the union of block types (a missing type counts as 0), and
 * returns the cosine of the angle between the two count vectors.
 *
 * @param conn - Database connection handed to getBlockTypeStats
 * @param docId1 - First document ID
 * @param docId2 - Second document ID
 * @returns Cosine similarity of the count vectors; 0 when stats are missing
 *          for either document, no block types exist, or a vector is all zeros
 */
function computeStructuralSimilarity(conn, docId1, docId2) {
    const countsA = getBlockTypeStats(conn, docId1);
    const countsB = getBlockTypeStats(conn, docId2);
    if (!countsA || !countsB) {
        return 0;
    }
    const blockTypes = new Set([...Object.keys(countsA), ...Object.keys(countsB)]);
    if (blockTypes.size === 0) {
        return 0;
    }
    // Accumulate the dot product and squared norms in one pass over the union.
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (const blockType of blockTypes) {
        const a = countsA[blockType] ?? 0;
        const b = countsB[blockType] ?? 0;
        dot += a * b;
        squaredA += a * a;
        squaredB += b * b;
    }
    if (squaredA === 0 || squaredB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
178
/**
 * Extract block_type_stats from the extras_json of a document's most recent
 * OCR result (ordered by processing_completed_at, newest first).
 *
 * Only a plain keyed object is accepted: arrays are rejected because callers
 * treat the result as a block_type -> count map, and an extras_json that
 * parses to null is treated as "no stats" rather than as a parse failure.
 *
 * @param conn - Database connection exposing prepare(sql).get(...)
 * @param docId - Document ID to look up
 * @returns The block_type_stats object, or null when the row, the extras_json
 *          column, or a usable block_type_stats value is missing, or when
 *          extras_json is not valid JSON (which is also logged to stderr)
 */
function getBlockTypeStats(conn, docId) {
    const row = conn
        .prepare('SELECT extras_json FROM ocr_results WHERE document_id = ? ORDER BY processing_completed_at DESC LIMIT 1')
        .get(docId);
    if (!row?.extras_json)
        return null;
    try {
        const extras = JSON.parse(row.extras_json);
        // extras may legitimately parse to null or a primitive; optional chaining
        // keeps that from being misreported as malformed JSON.
        const blockTypeStats = extras?.block_type_stats;
        if (!blockTypeStats || typeof blockTypeStats !== 'object' || Array.isArray(blockTypeStats))
            return null;
        return blockTypeStats;
    }
    catch (error) {
        // stderr only: stdout carries the JSON-RPC protocol.
        console.error(`[comparison] Failed to parse extras_json for block_type_stats of document ${docId}: ${error instanceof Error ? error.message : String(error)}`);
        return null;
    }
}
200
+ // ═══════════════════════════════════════════════════════════════════════════════
201
+ // TOOL HANDLERS
202
+ // ═══════════════════════════════════════════════════════════════════════════════
203
/**
 * Handle ocr_document_compare - diff two OCR-processed documents.
 *
 * Validates the input, loads both documents with their OCR rows, rejects a
 * repeat comparison whose OCR inputs are unchanged, computes the text and
 * structural diffs plus a weighted composite similarity, records provenance,
 * replaces any stored comparison for the pair, and returns the result.
 *
 * @param params - Raw tool arguments; validated against DocumentCompareInput
 * @returns Formatted response: the comparison payload on success, otherwise
 *          whatever handleError produces for the thrown error
 */
async function handleDocumentCompare(params) {
    try {
        const startTime = Date.now();
        const input = validateInput(DocumentCompareInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        if (input.document_id_1 === input.document_id_2) {
            throw new MCPError('VALIDATION_ERROR', 'Cannot compare document with itself. Provide two different document IDs.');
        }
        const { doc: doc1, ocr: ocr1 } = fetchCompleteDocument(conn, input.document_id_1);
        const { doc: doc2, ocr: ocr2 } = fetchCompleteDocument(conn, input.document_id_2);
        // Input hash identifies the pair of OCR contents being compared. It is
        // computed once and reused for duplicate detection and provenance.
        const ocrHash1 = String(ocr1.content_hash);
        const ocrHash2 = String(ocr2.content_hash);
        const inputHash = computeHash(`${ocrHash1}:${ocrHash2}`);
        // Duplicate comparison detection (pair matched in either order)
        const existingComparison = conn
            .prepare(`SELECT c.id, c.created_at, c.similarity_ratio
                FROM comparisons c
                WHERE (c.document_id_1 = ? AND c.document_id_2 = ?)
                   OR (c.document_id_1 = ? AND c.document_id_2 = ?)
                ORDER BY c.created_at DESC LIMIT 1`)
            .get(input.document_id_1, input.document_id_2, input.document_id_2, input.document_id_1);
        if (existingComparison) {
            // Check if underlying OCR data has changed since last comparison.
            // The stored hash is order-dependent, and the existing row may have
            // been created with the documents swapped, so test both orders.
            const reversedInputHash = computeHash(`${ocrHash2}:${ocrHash1}`);
            const prevInputHash = conn
                .prepare('SELECT input_hash FROM provenance WHERE id = (SELECT provenance_id FROM comparisons WHERE id = ?)')
                .get(existingComparison.id);
            if (prevInputHash &&
                (prevInputHash.input_hash === inputHash || prevInputHash.input_hash === reversedInputHash)) {
                throw new MCPError('VALIDATION_ERROR', `These documents were already compared with identical OCR content. ` +
                    `Existing comparison: ${existingComparison.id} (created ${existingComparison.created_at}, similarity ${(existingComparison.similarity_ratio * 100).toFixed(1)}%). ` +
                    `To re-compare, first reprocess one of the documents with ocr_reprocess.`);
            }
            // If input hashes differ, the OCR content has changed, allow re-comparison
        }
        const chunks1Count = countChunks(conn, input.document_id_1);
        const chunks2Count = countChunks(conn, input.document_id_2);
        // Text diff
        const textDiff = input.include_text_diff
            ? compareText(String(ocr1.extracted_text), String(ocr2.extracted_text), input.max_diff_operations)
            : null;
        // Structural diff
        const structuralDiff = compareStructure({
            page_count: doc1.page_count,
            text_length: Number(ocr1.text_length),
            quality_score: ocr1.parse_quality_score,
            ocr_mode: String(ocr1.datalab_mode),
            chunk_count: chunks1Count,
        }, {
            page_count: doc2.page_count,
            text_length: Number(ocr2.text_length),
            quality_score: ocr2.parse_quality_score,
            ocr_mode: String(ocr2.datalab_mode),
            chunk_count: chunks2Count,
        });
        // Generate summary
        const summary = generateSummary(textDiff, structuralDiff, String(doc1.file_name), String(doc2.file_name));
        // Text similarity comes from the text diff; it is 0 when the text diff is skipped
        const similarityRatio = textDiff ? textDiff.similarity_ratio : 0;
        // Multi-signal similarity computation (ME-6)
        // Track which components failed to surface in response instead of silently swallowing
        const componentsFailed = [];
        let embeddingSimilarity = null;
        try {
            embeddingSimilarity = computeEmbeddingCentroidSimilarity(conn, input.document_id_1, input.document_id_2);
        }
        catch (error) {
            console.error('[comparison] Centroid similarity failed:', error instanceof Error ? error.message : String(error));
            componentsFailed.push('centroid_similarity');
        }
        let structSimilarity = 0;
        try {
            structSimilarity = computeStructuralSimilarity(conn, input.document_id_1, input.document_id_2);
        }
        catch (error) {
            console.error('[comparison] Structural similarity failed:', error instanceof Error ? error.message : String(error));
            componentsFailed.push('structural_similarity');
        }
        // Quality alignment: how close are the OCR quality scores
        const q1 = ocr1.parse_quality_score ?? 0;
        const q2 = ocr2.parse_quality_score ?? 0;
        const qualityAlignment = q1 > 0 && q2 > 0
            ? 1 - Math.abs(q1 - q2) / Math.max(q1, q2)
            : 0;
        // Composite similarity: weighted blend of all signals.
        // When no embedding similarity is available its weight falls back to the text signal.
        const compositeSimilarity = 0.4 * similarityRatio +
            0.3 * (embeddingSimilarity ?? similarityRatio) +
            0.2 * structSimilarity +
            0.1 * qualityAlignment;
        // Compute content hash
        const diffContent = JSON.stringify({
            text_diff: textDiff,
            structural_diff: structuralDiff,
        });
        const contentHash = computeHash(diffContent);
        // Create provenance record
        const comparisonId = uuidv4();
        const now = new Date().toISOString();
        const tracker = getProvenanceTracker(db);
        const provId = tracker.createProvenance({
            type: ProvenanceType.COMPARISON,
            source_type: 'COMPARISON',
            source_id: String(ocr1.provenance_id),
            root_document_id: String(doc1.provenance_id),
            content_hash: contentHash,
            input_hash: inputHash,
            file_hash: String(doc1.file_hash),
            source_path: `${String(doc1.file_path)} <-> ${String(doc2.file_path)}`,
            processor: 'document-comparison',
            processor_version: '1.0.0',
            processing_params: { document_id_1: input.document_id_1, document_id_2: input.document_id_2 },
        });
        const processingDurationMs = Date.now() - startTime;
        // Update provenance with actual duration (not known at creation time)
        conn
            .prepare('UPDATE provenance SET processing_duration_ms = ? WHERE id = ?')
            .run(processingDurationMs, provId);
        // Insert comparison record
        const comparison = {
            id: comparisonId,
            document_id_1: input.document_id_1,
            document_id_2: input.document_id_2,
            similarity_ratio: similarityRatio,
            text_diff_json: JSON.stringify(textDiff ?? {}),
            structural_diff_json: JSON.stringify(structuralDiff),
            summary,
            content_hash: contentHash,
            provenance_id: provId,
            created_at: now,
            processing_duration_ms: processingDurationMs,
        };
        // F-INTEG-10: Delete stale comparisons for this document pair before inserting
        // (handles re-OCR creating new comparisons alongside outdated ones)
        conn
            .prepare(`DELETE FROM comparisons WHERE
                (document_id_1 = ? AND document_id_2 = ?) OR
                (document_id_1 = ? AND document_id_2 = ?)`)
            .run(input.document_id_1, input.document_id_2, input.document_id_2, input.document_id_1);
        insertComparison(conn, comparison);
        const comparisonResponse = {
            comparison_id: comparisonId,
            document_1: { id: input.document_id_1, file_name: doc1.file_name },
            document_2: { id: input.document_id_2, file_name: doc2.file_name },
            similarity_ratio: similarityRatio,
            composite_similarity: Math.round(compositeSimilarity * 10000) / 10000,
            similarity_signals: {
                text_similarity: similarityRatio,
                embedding_centroid_similarity: embeddingSimilarity !== null
                    ? Math.round(embeddingSimilarity * 10000) / 10000
                    : null,
                structural_similarity: Math.round(structSimilarity * 10000) / 10000,
                quality_alignment: Math.round(qualityAlignment * 10000) / 10000,
                weights: { text: 0.4, embedding: 0.3, structural: 0.2, quality: 0.1 },
            },
            summary,
            text_diff: textDiff,
            structural_diff: structuralDiff,
            provenance_id: provId,
            processing_duration_ms: processingDurationMs,
        };
        if (componentsFailed.length > 0) {
            comparisonResponse.components_failed = componentsFailed;
        }
        if (input.include_provenance) {
            comparisonResponse.provenance_chain = fetchProvenanceChain(db, provId, 'comparison');
        }
        comparisonResponse.next_steps = [
            { tool: 'ocr_comparison_list', description: 'View all comparisons in the database' },
        ];
        return formatResponse(successResult(comparisonResponse));
    }
    catch (error) {
        return handleError(error);
    }
}
376
/**
 * Handle ocr_comparison_list - list stored comparisons as lightweight summaries.
 *
 * The stored diff JSON columns are left out of each entry; only identifying
 * fields, the similarity ratio, summary and timing are returned.
 *
 * @param params - Raw tool arguments; validated against ComparisonListInput
 * @returns Formatted response, or the result of handleError on failure
 */
async function handleComparisonList(params) {
    try {
        const input = validateInput(ComparisonListInput, params);
        const { db } = requireDatabase();
        const rows = listComparisons(db.getConnection(), input);
        // Project each row down to its summary fields (no large JSON payloads).
        const summaries = rows.map(({ id, document_id_1, document_id_2, similarity_ratio, summary, created_at, processing_duration_ms, }) => ({
            id,
            document_id_1,
            document_id_2,
            similarity_ratio,
            summary,
            created_at,
            processing_duration_ms,
        }));
        const payload = {
            comparisons: summaries,
            count: summaries.length,
            offset: input.offset,
            limit: input.limit,
            next_steps: [
                { tool: 'ocr_comparison_get', description: 'View full diff data for a comparison' },
                { tool: 'ocr_document_compare', description: 'Compare two new documents' },
            ],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
404
/**
 * Handle ocr_comparison_get - return one stored comparison with its diff
 * JSON columns decoded into objects.
 *
 * @param params - Raw tool arguments; validated against ComparisonGetInput
 * @returns Formatted response, or the result of handleError on failure
 *          (including when the comparison ID does not exist)
 */
async function handleComparisonGet(params) {
    try {
        const { comparison_id: comparisonId } = validateInput(ComparisonGetInput, params);
        const { db } = requireDatabase();
        const stored = getComparison(db.getConnection(), comparisonId);
        if (!stored) {
            throw new MCPError('DOCUMENT_NOT_FOUND', `Comparison '${comparisonId}' not found`);
        }
        // Decode the stored JSON columns; malformed data surfaces as an MCPError.
        const textDiff = parseStoredJSON(stored.text_diff_json, 'text_diff_json', comparisonId);
        const structuralDiff = parseStoredJSON(stored.structural_diff_json, 'structural_diff_json', comparisonId);
        const payload = {
            ...stored,
            text_diff_json: textDiff,
            structural_diff_json: structuralDiff,
            next_steps: [
                { tool: 'ocr_document_get', description: 'View one of the compared documents' },
                { tool: 'ocr_comparison_list', description: 'Browse other comparisons' },
            ],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
425
+ // ═══════════════════════════════════════════════════════════════════════════════
426
+ // DISCOVER & BATCH HANDLERS
427
+ // ═══════════════════════════════════════════════════════════════════════════════
428
/**
 * Handle ocr_comparison_discover - suggest document pairs worth comparing.
 *
 * Obtains one embedding per document from computeDocumentEmbeddings, scores
 * every unordered pair with cosineSimilarity, keeps pairs at or above the
 * threshold (optionally skipping pairs that already have a stored
 * comparison), and returns the highest-scoring pairs up to the limit.
 *
 * @param params - Raw tool arguments; validated against ComparisonDiscoverInput
 * @returns Formatted response, or the result of handleError on failure
 */
async function handleComparisonDiscover(params) {
    try {
        const input = validateInput(ComparisonDiscoverInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const minSimilarity = input.min_similarity ?? 0.7;
        const excludeExisting = input.exclude_existing ?? true;
        const limit = input.limit ?? 20;
        const centroids = computeDocumentEmbeddings(conn, input.document_filter);
        if (centroids.length < 2) {
            const message = centroids.length === 0
                ? 'No documents with embeddings found'
                : 'At least 2 documents with embeddings required for comparison discovery';
            return formatResponse(successResult({
                pairs: [],
                total_pairs: 0,
                documents_analyzed: centroids.length,
                message,
                next_steps: [{ tool: 'ocr_process_pending', description: 'Process more documents to enable comparison' }],
            }));
        }
        const pairKey = (left, right) => `${left}:${right}`;
        // Keys of already-compared pairs, recorded in both orders so a single lookup suffices.
        const alreadyCompared = new Set();
        if (excludeExisting) {
            const storedPairs = conn
                .prepare('SELECT document_id_1, document_id_2 FROM comparisons')
                .all();
            for (const { document_id_1: left, document_id_2: right } of storedPairs) {
                alreadyCompared.add(pairKey(left, right));
                alreadyCompared.add(pairKey(right, left));
            }
        }
        // Resolve a display name for every document up front.
        const fileNameById = new Map();
        for (const { document_id: docId } of centroids) {
            fileNameById.set(docId, db.getDocument(docId)?.file_name ?? 'unknown');
        }
        const roundTo4 = (value) => Math.round(value * 10000) / 10000;
        const candidates = [];
        for (const [index, docA] of centroids.entries()) {
            for (const docB of centroids.slice(index + 1)) {
                if (excludeExisting && alreadyCompared.has(pairKey(docA.document_id, docB.document_id))) {
                    continue;
                }
                const score = cosineSimilarity(docA.embedding, Array.from(docB.embedding));
                if (score < minSimilarity) {
                    continue;
                }
                candidates.push({
                    document_id_1: docA.document_id,
                    document_id_2: docB.document_id,
                    similarity: roundTo4(score),
                    file_name_1: fileNameById.get(docA.document_id) ?? 'unknown',
                    file_name_2: fileNameById.get(docB.document_id) ?? 'unknown',
                });
            }
        }
        // Highest similarity first, then truncate to the requested limit.
        candidates.sort((a, b) => b.similarity - a.similarity);
        const topPairs = candidates.slice(0, limit);
        return formatResponse(successResult({
            pairs: topPairs,
            total_pairs: candidates.length,
            returned_pairs: topPairs.length,
            documents_analyzed: centroids.length,
            min_similarity: minSimilarity,
            exclude_existing: excludeExisting,
            next_steps: [
                { tool: 'ocr_document_compare', description: 'Compare a discovered similar pair' },
                { tool: 'ocr_comparison_batch', description: 'Compare all discovered pairs at once' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
511
/**
 * Handle ocr_comparison_batch - run ocr_document_compare over many pairs.
 *
 * Pairs come from `cluster_id` (every unordered pair of cluster members) when
 * it is set, otherwise from the explicit `pairs` list. Each pair is compared
 * sequentially through handleDocumentCompare with a reduced diff-operation
 * limit; per-pair failures are collected instead of aborting the batch. If
 * every pair fails, the batch itself is reported as an error.
 *
 * @param params - Raw tool arguments; validated against ComparisonBatchInput
 * @returns Formatted response, or the result of handleError on failure
 */
async function handleComparisonBatch(params) {
    try {
        const input = validateInput(ComparisonBatchInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        let pairsToCompare;
        if (input.cluster_id) {
            if (!getCluster(conn, input.cluster_id)) {
                throw new MCPError('DOCUMENT_NOT_FOUND', `Cluster "${input.cluster_id}" not found`);
            }
            const members = getClusterDocuments(conn, input.cluster_id);
            if (members.length < 2) {
                return formatResponse(successResult({
                    results: [],
                    total_compared: 0,
                    message: `Cluster has ${members.length} document(s), need at least 2 for comparison`,
                    next_steps: [{ tool: 'ocr_cluster_list', description: 'Find a cluster with more documents' }],
                }));
            }
            // Every unordered pair of members, in member order.
            pairsToCompare = members.flatMap((member, index) => members.slice(index + 1).map((other) => ({
                doc1: member.document_id,
                doc2: other.document_id,
            })));
        }
        else if (input.pairs && input.pairs.length > 0) {
            pairsToCompare = input.pairs;
        }
        else {
            throw new MCPError('VALIDATION_ERROR', 'Either pairs or cluster_id must be provided');
        }
        if (pairsToCompare.length === 0) {
            return formatResponse(successResult({
                results: [],
                total_compared: 0,
                message: 'No pairs to compare',
                next_steps: [{ tool: 'ocr_comparison_list', description: 'View existing comparisons' }],
            }));
        }
        const results = [];
        const errors = [];
        for (const { doc1, doc2 } of pairsToCompare) {
            try {
                // Reuse the single-pair handler and unwrap its serialized response.
                const response = await handleDocumentCompare({
                    document_id_1: doc1,
                    document_id_2: doc2,
                    include_text_diff: input.include_text_diff ?? true,
                    max_diff_operations: 100, // Use smaller limit for batch
                    include_provenance: false,
                });
                const outcome = JSON.parse(response.content[0].text);
                if (outcome.success && outcome.data) {
                    const { comparison_id, similarity_ratio, summary } = outcome.data;
                    results.push({
                        document_id_1: doc1,
                        document_id_2: doc2,
                        comparison_id,
                        similarity_ratio,
                        summary,
                    });
                    continue;
                }
                errors.push({ doc1, doc2, error: outcome.error?.message ?? 'Unknown error' });
            }
            catch (e) {
                errors.push({ doc1, doc2, error: e instanceof Error ? e.message : String(e) });
            }
        }
        // M-4: If every comparison failed, throw an error instead of returning success
        if (results.length === 0 && errors.length > 0) {
            const errorDetails = errors
                .map((failure) => ` ${failure.doc1} <-> ${failure.doc2}: ${failure.error}`)
                .join('\n');
            throw new MCPError('INTERNAL_ERROR', `All ${errors.length} comparison(s) failed:\n${errorDetails}`);
        }
        return formatResponse(successResult({
            results,
            errors: errors.length > 0 ? errors : undefined,
            total_compared: results.length,
            total_errors: errors.length,
            total_pairs_requested: pairsToCompare.length,
            next_steps: [
                { tool: 'ocr_comparison_list', description: 'List all comparison results' },
                { tool: 'ocr_comparison_get', description: 'View details for a specific comparison' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
618
+ // ═══════════════════════════════════════════════════════════════════════════════
619
+ // COMPARISON MATRIX HANDLER
620
+ // ═══════════════════════════════════════════════════════════════════════════════
621
/**
 * Input schema for ocr_comparison_matrix: an optional list of document IDs
 * and a cap on matrix size (2-100 documents, default 50).
 */
const ComparisonMatrixInput = z.object({
    document_ids: z
        .array(z.string())
        .optional()
        .describe('Document IDs to include (default: all documents with embeddings)'),
    max_documents: z
        .number()
        .int()
        .min(2)
        .max(100)
        .default(50)
        .describe('Maximum documents in matrix'),
});
627
/**
 * Handle ocr_comparison_matrix - build an NxN similarity matrix.
 *
 * Takes one embedding per document from computeDocumentEmbeddings, keeps at
 * most `max_documents` of them, and fills a square matrix with
 * cosineSimilarity scores rounded to four decimals (diagonal fixed at 1.0).
 * The most similar pair, least similar pair and average are taken over the
 * upper triangle so each unordered pair is counted once.
 *
 * @param params - Raw tool arguments; validated against ComparisonMatrixInput
 * @returns Formatted response, or the result of handleError on failure
 *          (including when fewer than two documents have embeddings)
 */
async function handleComparisonMatrix(params) {
    try {
        const input = validateInput(ComparisonMatrixInput, params);
        const { db } = requireDatabase();
        const centroids = computeDocumentEmbeddings(db.getConnection(), input.document_ids);
        if (centroids.length < 2) {
            throw new MCPError('VALIDATION_ERROR', `Need at least 2 documents with embeddings for a similarity matrix. Found: ${centroids.length}`);
        }
        // Cap the matrix size at max_documents.
        const selected = centroids.slice(0, input.max_documents);
        const documentIds = selected.map((entry) => entry.document_id);
        const fileNames = selected.map((entry) => db.getDocument(entry.document_id)?.file_name ?? 'unknown');
        const size = selected.length;
        const roundTo4 = (value) => Math.round(value * 10000) / 10000;
        // Sentinels sit outside the range of any score so the first pair always replaces them.
        let mostSimilarPair = { doc1_index: 0, doc2_index: 1, similarity: -1 };
        let leastSimilarPair = { doc1_index: 0, doc2_index: 1, similarity: 2 };
        let similaritySum = 0;
        let pairsCounted = 0;
        const matrix = [];
        for (let rowIndex = 0; rowIndex < size; rowIndex++) {
            const cells = [];
            for (let colIndex = 0; colIndex < size; colIndex++) {
                if (rowIndex === colIndex) {
                    cells.push(1.0);
                    continue;
                }
                const score = roundTo4(cosineSimilarity(selected[rowIndex].embedding, Array.from(selected[colIndex].embedding)));
                cells.push(score);
                // Statistics use the upper triangle only, to avoid double-counting.
                if (colIndex <= rowIndex) {
                    continue;
                }
                similaritySum += score;
                pairsCounted++;
                if (score > mostSimilarPair.similarity) {
                    mostSimilarPair = { doc1_index: rowIndex, doc2_index: colIndex, similarity: score };
                }
                if (score < leastSimilarPair.similarity) {
                    leastSimilarPair = { doc1_index: rowIndex, doc2_index: colIndex, similarity: score };
                }
            }
            matrix.push(cells);
        }
        const averageSimilarity = pairsCounted > 0
            ? Math.round((similaritySum / pairsCounted) * 10000) / 10000
            : 0;
        return formatResponse(successResult({
            document_ids: documentIds,
            file_names: fileNames,
            matrix,
            most_similar_pair: mostSimilarPair,
            least_similar_pair: leastSimilarPair,
            average_similarity: averageSimilarity,
            documents_analyzed: size,
            next_steps: [
                { tool: 'ocr_document_compare', description: 'Compare the most similar pair in detail' },
                { tool: 'ocr_cluster_documents', description: 'Cluster documents by similarity' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
700
+ // ═══════════════════════════════════════════════════════════════════════════════
701
+ // TOOL EXPORTS
702
+ // ═══════════════════════════════════════════════════════════════════════════════
703
/** Build one tool registration entry from its description, schema and handler. */
const defineComparisonTool = (description, schema, handler) => ({
    description,
    inputSchema: schema.shape,
    handler,
});
/**
 * Comparison tool registry, keyed by tool name. Each entry carries the tool's
 * description, the shape of its input schema, and its handler.
 */
export const comparisonTools = {
    ocr_document_compare: defineComparisonTool('[ANALYSIS] Diff two documents for text and structural differences. Returns similarity ratios and diffs. Both must have status "complete".', DocumentCompareInput, handleDocumentCompare),
    ocr_comparison_list: defineComparisonTool('[ANALYSIS] Use to list past document comparisons with optional filtering by document ID. Returns comparison summaries with similarity ratios. Use ocr_comparison_get for full diff data.', ComparisonListInput, handleComparisonList),
    ocr_comparison_get: defineComparisonTool('[ANALYSIS] Use to retrieve full diff data for a specific comparison by ID. Returns text diff operations and structural differences. Use after ocr_comparison_list.', ComparisonGetInput, handleComparisonGet),
    ocr_comparison_discover: defineComparisonTool('[ANALYSIS] Find likely-similar document pairs ranked by embedding similarity. Follow with ocr_document_compare or ocr_comparison_batch.', ComparisonDiscoverInput, handleComparisonDiscover),
    ocr_comparison_batch: defineComparisonTool('[ANALYSIS] Compare multiple document pairs at once. Provide explicit pairs or a cluster_id to compare all within a cluster.', ComparisonBatchInput, handleComparisonBatch),
    ocr_comparison_matrix: defineComparisonTool('[ANALYSIS] NxN pairwise cosine similarity matrix across documents. Returns most/least similar pairs and averages. Requires embeddings.', ComparisonMatrixInput, handleComparisonMatrix),
};
735
+ //# sourceMappingURL=comparison.js.map