ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,1394 @@
1
+ /**
2
+ * Evaluation Report MCP Tools
3
+ *
4
+ * Tools for generating evaluation reports on OCR and VLM processing results.
5
+ * Produces markdown reports with statistics, metrics, and quality analysis.
6
+ *
7
+ * CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
8
+ * Use console.error() for all logging.
9
+ *
10
+ * @module tools/reports
11
+ */
12
+ import { z } from 'zod';
13
+ import * as fs from 'fs';
14
+ import { dirname } from 'path';
15
+ import { safeMin, safeMax } from '../utils/math.js';
16
+ import { requireDatabase } from '../server/state.js';
17
+ import { successResult } from '../server/types.js';
18
+ import { MCPError } from '../server/errors.js';
19
+ import { formatResponse, handleError } from './shared.js';
20
+ import { validateInput, sanitizePath } from '../utils/validation.js';
21
+ import { getImageStats, getImagesByDocument, } from '../services/storage/database/image-operations.js';
22
+ import { getComparisonSummariesByDocument } from '../services/storage/database/comparison-operations.js';
23
+ import { getClusteringStats, getClusterSummariesForDocument, } from '../services/storage/database/cluster-operations.js';
24
+ // ===============================================================================
25
+ // VALIDATION SCHEMAS
26
+ // ===============================================================================
27
// Field factories. Each call builds a fresh schema instance, so no two
// properties ever share the same Zod object.
const optionalStringField = () => z.string().optional();
const resultLimitField = () => z.number().int().min(1).max(100).default(20);
const timeBucketField = () => z.enum(['hourly', 'daily', 'weekly', 'monthly']).default('daily');

/** Input for ocr_evaluation_report. */
const EvaluationReportInput = z.object({
    output_path: optionalStringField(),
    confidence_threshold: z.number().min(0).max(1).default(0.7),
});

/** Input for ocr_document_report. */
const DocumentReportInput = z.object({
    document_id: z.string().min(1),
});

/** Input for ocr_report_overview. */
const ReportOverviewInput = z.object({
    section: z.enum(['quality', 'corpus', 'all']).default('all'),
    include_section_frequency: z.boolean().default(true),
    include_content_type_distribution: z.boolean().default(true),
    limit: resultLimitField(),
});

/** Input for ocr_report_performance. */
const ReportPerformanceInput = z.object({
    section: z.enum(['pipeline', 'throughput', 'bottlenecks', 'all']).default('all'),
    group_by: z.enum(['total', 'document', 'mode', 'file_type']).default('total'),
    limit: resultLimitField(),
    processor_filter: optionalStringField(),
    bucket: timeBucketField(),
    created_after: optionalStringField(),
    created_before: optionalStringField(),
});

/** Input for error analytics (smaller limit range than the report tools). */
const ErrorAnalyticsInput = z.object({
    include_error_messages: z.boolean().default(true),
    limit: z.number().int().min(1).max(50).default(10),
});

// MERGE-C: Unified trends schema (ocr_quality_trends + ocr_timeline_analytics → ocr_trends)
const TrendsInput = z.object({
    metric: z
        .enum(['quality', 'volume'])
        .describe('Trend type: quality (OCR scores over time) or volume (processing counts over time)'),
    bucket: timeBucketField(),
    created_after: optionalStringField(),
    created_before: optionalStringField(),
    // Only consulted when metric === 'quality'
    group_by: z
        .enum(['none', 'ocr_mode', 'processor'])
        .default('none')
        .describe('(quality only) Group by OCR mode or processor'),
    // Only consulted when metric === 'volume'
    volume_metric: z
        .enum(['documents', 'pages', 'chunks', 'embeddings', 'images', 'cost'])
        .default('documents')
        .describe('(volume only) Which metric to track over time'),
});
66
/**
 * Handle ocr_evaluation_report - Generate comprehensive evaluation report
 *
 * Gathers global image/database statistics, per-document VLM statistics,
 * the lowest-confidence images, and comparison/clustering aggregates, then
 * renders them through generateMarkdownReport(). When `output_path` is given
 * the markdown is written to the sanitized path and omitted from the
 * response; otherwise the markdown is returned inline in `report`.
 *
 * Per-document statistics (and the values derived from that pass:
 * overall_avg_confidence and image_type_distribution) cover at most the
 * 1000 documents returned by listDocuments(). The summary's total_documents
 * and total_pages are taken from corpus-wide sources so they are not capped
 * by that page size.
 *
 * @param {object} params - Raw tool arguments, validated against EvaluationReportInput.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError() if validation, a query, or the file write throws.
 */
export async function handleEvaluationReport(params) {
    try {
        const input = validateInput(EvaluationReportInput, params);
        const outputPath = input.output_path;
        const confidenceThreshold = input.confidence_threshold ?? 0.7;
        const { db } = requireDatabase();
        // Get overall stats
        const imageStats = getImageStats(db.getConnection());
        const dbStats = db.getStats();
        // Corpus-wide page total: summing over the capped document list below
        // would under-report once the corpus exceeds the 1000-document page.
        const { total_pages: totalPages } = db
            .getConnection()
            .prepare('SELECT COALESCE(SUM(page_count), 0) as total_pages FROM documents')
            .get();
        // Get per-document stats
        const documents = db.listDocuments({ limit: 1000 });
        const docStats = [];
        const imageTypeDistribution = {};
        let totalConfidence = 0;
        let confidenceCount = 0;
        // M-10: Prepare per-document image status count query (reuse statement)
        const docImageCountStmt = db.getConnection().prepare(`
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN vlm_status = 'pending' THEN 1 END) as pending,
                COUNT(CASE WHEN vlm_status = 'failed' THEN 1 END) as failed
            FROM images WHERE document_id = ?
        `);
        for (const doc of documents) {
            // M-10: Use vlmStatus filter to only load complete images from SQL
            const completeImages = getImagesByDocument(db.getConnection(), doc.id, {
                vlmStatus: 'complete',
            });
            const ocrResult = db.getOCRResultByDocumentId(doc.id);
            const docImageCounts = docImageCountStmt.get(doc.id);
            const confidences = completeImages
                .filter((i) => i.vlm_confidence !== null)
                .map((i) => i.vlm_confidence);
            // Track image types, both for this document and across the whole pass
            const docImageTypes = {};
            for (const img of completeImages) {
                if (img.vlm_structured_data) {
                    const imageType = img.vlm_structured_data.imageType || 'other';
                    docImageTypes[imageType] = (docImageTypes[imageType] || 0) + 1;
                    imageTypeDistribution[imageType] = (imageTypeDistribution[imageType] || 0) + 1;
                }
            }
            // Sum once; the same value feeds the per-document average and the running total
            const confidenceSum = confidences.reduce((a, b) => a + b, 0);
            const avgConfidence = confidences.length > 0 ? confidenceSum / confidences.length : 0;
            totalConfidence += confidenceSum;
            confidenceCount += confidences.length;
            docStats.push({
                document_id: doc.id,
                file_name: doc.file_name,
                page_count: doc.page_count,
                ocr_text_length: ocrResult?.text_length ?? 0,
                image_count: docImageCounts.total,
                vlm_complete: completeImages.length,
                vlm_pending: docImageCounts.pending,
                vlm_failed: docImageCounts.failed,
                avg_confidence: avgConfidence,
                min_confidence: safeMin(confidences) ?? 0,
                max_confidence: safeMax(confidences) ?? 0,
                image_types: docImageTypes,
            });
        }
        // M-10: Direct SQL for low confidence images instead of tracking in per-document loop
        const lowConfidenceImages = db
            .getConnection()
            .prepare(`
                SELECT i.id as image_id, i.document_id, d.file_name, i.page_number as page,
                    i.vlm_confidence as confidence,
                    COALESCE(json_extract(i.vlm_structured_data, '$.imageType'), 'unknown') as image_type,
                    COALESCE(i.extracted_path, 'unknown') as path
                FROM images i
                JOIN documents d ON d.id = i.document_id
                WHERE i.vlm_status = 'complete'
                    AND i.vlm_confidence IS NOT NULL
                    AND i.vlm_confidence < ?
                ORDER BY i.vlm_confidence ASC
                LIMIT 50
            `)
            .all(confidenceThreshold);
        // Calculate overall average confidence
        const overallAvgConfidence = confidenceCount > 0 ? totalConfidence / confidenceCount : 0;
        // Comparison statistics (avg_similarity is NULL when the table is empty)
        const comparisonSummary = db
            .getConnection()
            .prepare(`
                SELECT COUNT(*) as count, AVG(similarity_ratio) as avg_similarity
                FROM comparisons
            `)
            .get();
        const comparisonCount = comparisonSummary.count;
        const avgComparisonSimilarity = comparisonSummary.avg_similarity;
        // Clustering statistics
        const clusteringStats = getClusteringStats(db.getConnection());
        // Generate markdown report
        const report = generateMarkdownReport({
            dbStats,
            imageStats,
            docStats,
            lowConfidenceImages, // Already limited to 50 by SQL query
            imageTypeDistribution,
            overallAvgConfidence,
            confidenceThreshold,
            comparisonStats: { total: comparisonCount, avg_similarity: avgComparisonSimilarity },
            clusteringStats,
        });
        // Save to file if path provided
        if (outputPath) {
            const safeOutputPath = sanitizePath(outputPath);
            const dir = dirname(safeOutputPath);
            if (!fs.existsSync(dir)) {
                fs.mkdirSync(dir, { recursive: true });
            }
            fs.writeFileSync(safeOutputPath, report);
            // stdout is reserved for JSON-RPC, so log on stderr
            console.error(`[INFO] Report saved to: ${safeOutputPath}`);
        }
        return formatResponse(successResult({
            summary: {
                total_documents: dbStats.total_documents,
                total_pages: totalPages,
                total_images: imageStats.total,
                vlm_processed: imageStats.processed,
                vlm_pending: imageStats.pending,
                vlm_failed: imageStats.failed,
                overall_avg_confidence: overallAvgConfidence,
                low_confidence_count: lowConfidenceImages.length,
                total_comparisons: comparisonCount,
                avg_comparison_similarity: avgComparisonSimilarity,
                total_clusters: clusteringStats.total_clusters,
                total_cluster_runs: clusteringStats.total_runs,
                avg_coherence: clusteringStats.avg_coherence,
            },
            image_type_distribution: imageTypeDistribution,
            output_path: outputPath ?? null,
            report: outputPath ? null : report, // Only include report in response if not saved to file
            next_steps: [
                { tool: 'ocr_report_overview', description: 'Get quality and corpus overview' },
                { tool: 'ocr_evaluate', description: 'Evaluate more images' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
212
/**
 * Handle ocr_document_report - Generate report for a single document
 *
 * Looks the document up by id and returns its metadata together with OCR
 * result details, chunk count, per-image VLM details and aggregates,
 * structured extractions, comparisons involving the document, and the
 * clusters it belongs to.
 *
 * @param {object} params - Raw tool arguments, validated against DocumentReportInput.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError() (including the DOCUMENT_NOT_FOUND case).
 */
export async function handleDocumentReport(params) {
    try {
        const { document_id: documentId } = validateInput(DocumentReportInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(documentId);
        if (!doc) {
            throw new MCPError('DOCUMENT_NOT_FOUND', `Document not found: ${documentId}`, {
                document_id: documentId,
            });
        }
        const conn = db.getConnection();
        const ocrResult = db.getOCRResultByDocumentId(documentId);
        const images = getImagesByDocument(conn, documentId);
        const chunks = db.getChunksByDocumentId(documentId);
        const extractions = db.getExtractionsByDocument(documentId);

        // One pass over the images: bucket by VLM status
        const completeImages = [];
        let pendingCount = 0;
        let failedCount = 0;
        for (const image of images) {
            if (image.vlm_status === 'complete') {
                completeImages.push(image);
            }
            else if (image.vlm_status === 'pending') {
                pendingCount += 1;
            }
            else if (image.vlm_status === 'failed') {
                failedCount += 1;
            }
        }

        // Confidence values and image-type tally come from completed images only
        const confidences = [];
        const imageTypes = {};
        for (const image of completeImages) {
            if (image.vlm_confidence !== null) {
                confidences.push(image.vlm_confidence);
            }
            if (image.vlm_structured_data) {
                const typeKey = image.vlm_structured_data.imageType || 'other';
                imageTypes[typeKey] = (imageTypes[typeKey] || 0) + 1;
            }
        }
        let avgConfidence = null;
        if (confidences.length > 0) {
            avgConfidence = confidences.reduce((a, b) => a + b, 0) / confidences.length;
        }

        // Per-image detail rows (all statuses)
        const imageDetails = images.map((image) => {
            const structured = image.vlm_structured_data;
            return {
                id: image.id,
                page: image.page_number,
                index: image.image_index,
                format: image.format,
                dimensions: image.dimensions,
                vlm_status: image.vlm_status,
                confidence: image.vlm_confidence,
                image_type: structured?.imageType || null,
                primary_subject: structured?.primarySubject || null,
                description_length: image.vlm_description?.length ?? 0,
                has_embedding: !!image.vlm_embedding_id,
                error: image.error_message,
            };
        });

        const docComparisons = getComparisonSummariesByDocument(conn, documentId);
        const docClusterMemberships = getClusterSummariesForDocument(conn, documentId);

        const documentSection = {
            id: doc.id,
            file_name: doc.file_name,
            file_path: doc.file_path,
            file_type: doc.file_type,
            file_size: doc.file_size,
            status: doc.status,
            page_count: doc.page_count,
            doc_title: doc.doc_title ?? null,
            doc_author: doc.doc_author ?? null,
            doc_subject: doc.doc_subject ?? null,
        };

        let ocrSection = null;
        if (ocrResult) {
            ocrSection = {
                text_length: ocrResult.text_length,
                quality_score: ocrResult.parse_quality_score,
                processing_duration_ms: ocrResult.processing_duration_ms,
                mode: ocrResult.datalab_mode,
                cost_cents: ocrResult.cost_cents,
                datalab_request_id: ocrResult.datalab_request_id,
                content_hash: ocrResult.content_hash,
            };
        }

        const imagesSection = {
            total: images.length,
            complete: completeImages.length,
            pending: pendingCount,
            failed: failedCount,
            avg_confidence: avgConfidence,
            min_confidence: safeMin(confidences) ?? null,
            max_confidence: safeMax(confidences) ?? null,
            type_distribution: imageTypes,
            details: imageDetails,
        };

        // Stored JSON columns are decoded for the response; empty values become null
        const decodeJson = (text) => (text ? JSON.parse(text) : null);
        const extractionItems = extractions.map((extraction) => ({
            id: extraction.id,
            schema: decodeJson(extraction.schema_json),
            result: decodeJson(extraction.extraction_json),
            created_at: extraction.created_at,
            provenance_id: extraction.provenance_id,
        }));

        const comparisonItems = docComparisons.map((comparison) => {
            // Report the *other* document of the pair
            const otherId = comparison.document_id_1 === documentId
                ? comparison.document_id_2
                : comparison.document_id_1;
            return {
                id: comparison.id,
                compared_with: otherId,
                similarity_ratio: comparison.similarity_ratio,
                summary: comparison.summary,
                created_at: comparison.created_at,
                processing_duration_ms: comparison.processing_duration_ms,
            };
        });

        const clusterItems = docClusterMemberships.map((membership) => ({
            cluster_id: membership.id,
            run_id: membership.run_id,
            cluster_index: membership.cluster_index,
            label: membership.label,
            classification_tag: membership.classification_tag,
            coherence_score: membership.coherence_score,
        }));

        const payload = {
            document: documentSection,
            ocr: ocrSection,
            chunks: { total: chunks.length },
            images: imagesSection,
            extractions: { total: extractions.length, items: extractionItems },
            comparisons: { total: docComparisons.length, items: comparisonItems },
            clusters: { total: docClusterMemberships.length, items: clusterItems },
            next_steps: [
                { tool: 'ocr_document_get', description: 'Get document metadata' },
                { tool: 'ocr_search', description: 'Search within this document' },
            ],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
341
/**
 * Handle ocr_report_overview - Consolidated quality + corpus overview
 * Merges former ocr_quality_summary and ocr_corpus_profile.
 * section='quality' | 'corpus' | 'all' (default: 'all')
 *
 * The quality section aggregates document status counts, OCR quality scores,
 * costs, VLM confidence, extraction/form-fill counts, and comparison and
 * clustering statistics. The corpus section profiles document sizes, file
 * types, chunk statistics, and (optionally) content-type and heading
 * frequency distributions, each capped at `limit` rows.
 *
 * @param {object} params - Raw tool arguments, validated against ReportOverviewInput.
 * @returns {Promise<object>} The formatted success response containing the
 *   requested section(s), or the result of handleError() on failure.
 */
export async function handleReportOverview(params) {
    try {
        const input = validateInput(ReportOverviewInput, params);
        const section = input.section ?? 'all';
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const result = { section };
        // ---- Quality section (former ocr_quality_summary) ----
        if (section === 'quality' || section === 'all') {
            const imageStats = getImageStats(conn);
            const dbStats = db.getStats();
            const confStats = conn
                .prepare(`
                    SELECT
                        COUNT(*) as cnt,
                        AVG(vlm_confidence) as avg_conf,
                        MIN(vlm_confidence) as min_conf,
                        MAX(vlm_confidence) as max_conf,
                        SUM(CASE WHEN vlm_confidence >= 0.9 THEN 1 ELSE 0 END) as high,
                        SUM(CASE WHEN vlm_confidence >= 0.7 AND vlm_confidence < 0.9 THEN 1 ELSE 0 END) as medium,
                        SUM(CASE WHEN vlm_confidence >= 0.5 AND vlm_confidence < 0.7 THEN 1 ELSE 0 END) as low,
                        SUM(CASE WHEN vlm_confidence < 0.5 THEN 1 ELSE 0 END) as very_low
                    FROM images
                    WHERE vlm_status = 'complete' AND vlm_confidence IS NOT NULL
                `)
                .get();
            const ocrQualityStats = conn
                .prepare(`
                    SELECT
                        COUNT(parse_quality_score) as scored_count,
                        AVG(parse_quality_score) as avg_quality,
                        MIN(parse_quality_score) as min_quality,
                        MAX(parse_quality_score) as max_quality,
                        SUM(CASE WHEN parse_quality_score >= 4 THEN 1 ELSE 0 END) as excellent,
                        SUM(CASE WHEN parse_quality_score >= 3 AND parse_quality_score < 4 THEN 1 ELSE 0 END) as good,
                        SUM(CASE WHEN parse_quality_score >= 2 AND parse_quality_score < 3 THEN 1 ELSE 0 END) as fair,
                        SUM(CASE WHEN parse_quality_score < 2 THEN 1 ELSE 0 END) as poor,
                        COALESCE(SUM(cost_cents), 0) as total_ocr_cost
                    FROM ocr_results
                `)
                .get();
            const formFillCost = conn
                .prepare('SELECT COALESCE(SUM(cost_cents), 0) as total FROM form_fills')
                .get().total;
            const comparisonStats = conn
                .prepare(`
                    SELECT
                        COUNT(*) as total,
                        AVG(similarity_ratio) as avg_similarity,
                        MIN(similarity_ratio) as min_similarity,
                        MAX(similarity_ratio) as max_similarity
                    FROM comparisons
                `)
                .get();
            const qualityClusteringStats = getClusteringStats(conn);
            result.quality = {
                documents: {
                    total: dbStats.total_documents,
                    complete: dbStats.documents_by_status.complete,
                    failed: dbStats.documents_by_status.failed,
                    pending: dbStats.documents_by_status.pending,
                },
                ocr: {
                    total_chunks: dbStats.total_chunks,
                    total_embeddings: dbStats.total_embeddings,
                },
                ocr_quality: {
                    // Aggregates are reported as null rather than SQL's NULL/0 when nothing is scored
                    average: ocrQualityStats.scored_count > 0 ? ocrQualityStats.avg_quality : null,
                    min: ocrQualityStats.scored_count > 0 ? ocrQualityStats.min_quality : null,
                    max: ocrQualityStats.scored_count > 0 ? ocrQualityStats.max_quality : null,
                    scored_count: ocrQualityStats.scored_count,
                    distribution: {
                        // SUM() over zero rows yields NULL, hence the || 0
                        excellent_gte4: ocrQualityStats.excellent || 0,
                        good_3to4: ocrQualityStats.good || 0,
                        fair_2to3: ocrQualityStats.fair || 0,
                        poor_lt2: ocrQualityStats.poor || 0,
                    },
                },
                costs: {
                    total_ocr_cost_cents: ocrQualityStats.total_ocr_cost,
                    total_form_fill_cost_cents: formFillCost,
                    total_cost_cents: ocrQualityStats.total_ocr_cost + formFillCost,
                },
                images: {
                    total: imageStats.total,
                    processed: imageStats.processed,
                    pending: imageStats.pending,
                    failed: imageStats.failed,
                    processing_rate: imageStats.total > 0
                        ? `${((imageStats.processed / imageStats.total) * 100).toFixed(1)}%`
                        : '0%',
                },
                vlm_confidence: {
                    average: confStats.cnt > 0 ? confStats.avg_conf : null,
                    min: confStats.cnt > 0 ? confStats.min_conf : null,
                    max: confStats.cnt > 0 ? confStats.max_conf : null,
                    distribution: {
                        high: confStats.high || 0,
                        medium: confStats.medium || 0,
                        low: confStats.low || 0,
                        very_low: confStats.very_low || 0,
                    },
                },
                extractions: {
                    total: dbStats.total_extractions,
                    extraction_rate: dbStats.total_documents > 0
                        ? `${((dbStats.total_extractions / dbStats.total_documents) * 100).toFixed(1)}%`
                        : '0%',
                },
                form_fills: {
                    total: dbStats.total_form_fills,
                },
                comparisons: {
                    total: comparisonStats.total,
                    avg_similarity: comparisonStats.total > 0 ? comparisonStats.avg_similarity : null,
                    min_similarity: comparisonStats.total > 0 ? comparisonStats.min_similarity : null,
                    max_similarity: comparisonStats.total > 0 ? comparisonStats.max_similarity : null,
                },
                clustering: {
                    total_clusters: qualityClusteringStats.total_clusters,
                    total_runs: qualityClusteringStats.total_runs,
                    avg_coherence: qualityClusteringStats.total_clusters > 0 ? qualityClusteringStats.avg_coherence : null,
                },
            };
        }
        // ---- Corpus section (former ocr_corpus_profile) ----
        if (section === 'corpus' || section === 'all') {
            // Document size distribution (completed documents only)
            const docSizeStats = conn
                .prepare(`
                    SELECT
                        COALESCE(AVG(page_count), 0) as avg_page_count,
                        COALESCE(MIN(page_count), 0) as min_page_count,
                        COALESCE(MAX(page_count), 0) as max_page_count,
                        COALESCE(AVG(file_size), 0) as avg_file_size,
                        COALESCE(SUM(file_size), 0) as total_file_size,
                        COUNT(*) as total_documents
                    FROM documents
                    WHERE status = 'complete'
                `)
                .get();
            const fileTypeDistribution = conn
                .prepare(`
                    SELECT file_type, COUNT(*) as count
                    FROM documents
                    GROUP BY file_type
                    ORDER BY count DESC
                `)
                .all();
            const chunkStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total_chunks,
                        COALESCE(AVG(LENGTH(text)), 0) as avg_text_length,
                        COALESCE(MIN(LENGTH(text)), 0) as min_text_length,
                        COALESCE(MAX(LENGTH(text)), 0) as max_text_length,
                        COALESCE(SUM(CASE WHEN is_atomic = 1 THEN 1 ELSE 0 END), 0) as atomic_chunks,
                        COALESCE(SUM(CASE WHEN heading_context IS NOT NULL AND heading_context != '' THEN 1 ELSE 0 END), 0) as chunks_with_headings
                    FROM chunks
                `)
                .get();
            const chunksPerDoc = conn
                .prepare(`
                    SELECT
                        COALESCE(AVG(cnt), 0) as avg_chunks,
                        COALESCE(MIN(cnt), 0) as min_chunks,
                        COALESCE(MAX(cnt), 0) as max_chunks
                    FROM (SELECT COUNT(*) as cnt FROM chunks GROUP BY document_id)
                `)
                .get();
            const avgContentTypes = conn
                .prepare(`
                    SELECT COALESCE(AVG(
                        CASE
                            WHEN content_types IS NOT NULL AND content_types != '[]' AND content_types != ''
                            THEN json_array_length(content_types)
                            ELSE 0
                        END
                    ), 0) as avg_content_types
                    FROM chunks
                `)
                .get();
            const corpusData = {
                documents: {
                    total_complete: docSizeStats.total_documents,
                    avg_page_count: docSizeStats.avg_page_count,
                    min_page_count: docSizeStats.min_page_count,
                    max_page_count: docSizeStats.max_page_count,
                    avg_file_size: docSizeStats.avg_file_size,
                    total_file_size: docSizeStats.total_file_size,
                },
                file_types: fileTypeDistribution,
                chunks: {
                    total_chunks: chunkStats.total_chunks,
                    avg_text_length: chunkStats.avg_text_length,
                    min_text_length: chunkStats.min_text_length,
                    max_text_length: chunkStats.max_text_length,
                    avg_content_types_per_chunk: avgContentTypes.avg_content_types,
                    atomic_chunks: chunkStats.atomic_chunks,
                    chunks_with_headings: chunkStats.chunks_with_headings,
                    per_document: {
                        avg: chunksPerDoc.avg_chunks,
                        min: chunksPerDoc.min_chunks,
                        max: chunksPerDoc.max_chunks,
                    },
                },
            };
            if (input.include_content_type_distribution) {
                // NULLIF maps '' to NULL so empty-string rows are treated like
                // NULL ones (an empty array). The average query above guards
                // against '' explicitly; without the same guard here json_each
                // would be handed an empty string, which is not valid JSON.
                corpusData.content_type_distribution = conn
                    .prepare(`
                        SELECT
                            j.value as content_type,
                            COUNT(*) as count
                        FROM chunks, json_each(COALESCE(NULLIF(content_types, ''), '[]')) j
                        GROUP BY j.value
                        ORDER BY count DESC
                        LIMIT ?
                    `)
                    .all(input.limit);
            }
            if (input.include_section_frequency) {
                corpusData.section_frequency = conn
                    .prepare(`
                        SELECT
                            heading_context,
                            COUNT(*) as occurrence_count,
                            COUNT(DISTINCT document_id) as document_count
                        FROM chunks
                        WHERE heading_context IS NOT NULL AND heading_context != ''
                        GROUP BY heading_context
                        ORDER BY occurrence_count DESC
                        LIMIT ?
                    `)
                    .all(input.limit);
            }
            corpusData.image_type_distribution = conn
                .prepare(`
                    SELECT
                        COALESCE(json_extract(vlm_structured_data, '$.imageType'), 'unknown') as image_type,
                        COUNT(*) as count
                    FROM images
                    WHERE vlm_status = 'complete' AND vlm_structured_data IS NOT NULL
                    GROUP BY image_type
                    ORDER BY count DESC
                `)
                .all();
            result.corpus = corpusData;
        }
        result.next_steps = [
            { tool: 'ocr_report_performance', description: 'Get pipeline performance analytics' },
            { tool: 'ocr_error_analytics', description: 'Analyze errors and failures' },
            { tool: 'ocr_trends', description: 'View quality/volume trends over time' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
605
+ // ═══════════════════════════════════════════════════════════════════════════════
606
+ // COST ANALYTICS HANDLER
607
+ // ═══════════════════════════════════════════════════════════════════════════════
608
/**
 * Handle ocr_cost_summary - Get cost analytics for OCR and form fill operations
 *
 * Always returns overall OCR and form-fill cost totals plus compute-duration
 * aggregates for comparisons and clustering. `group_by` optionally adds one
 * breakdown: `by_mode`, `by_document` (top 50 by cost), or `by_month`.
 *
 * @param {object} params - Raw tool arguments; `group_by` defaults to 'total'.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError() on failure.
 */
async function handleCostSummary(params) {
    try {
        const costSummarySchema = z.object({
            group_by: z.enum(['document', 'mode', 'month', 'total']).default('total'),
        });
        const { group_by: groupBy } = validateInput(costSummarySchema, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();

        const {
            ocr_cost: ocrCost,
            form_fill_cost: formFillCost,
            ocr_count: ocrCount,
            form_fill_count: formFillCount,
        } = conn
            .prepare(`
                SELECT
                    (SELECT COALESCE(SUM(cost_cents), 0) FROM ocr_results) as ocr_cost,
                    (SELECT COALESCE(SUM(cost_cents), 0) FROM form_fills) as form_fill_cost,
                    (SELECT COUNT(*) FROM ocr_results WHERE cost_cents > 0) as ocr_count,
                    (SELECT COUNT(*) FROM form_fills WHERE cost_cents > 0) as form_fill_count
            `)
            .get();

        const result = {
            total_cost_cents: ocrCost + formFillCost,
            total_cost_dollars: ((ocrCost + formFillCost) / 100).toFixed(2),
            ocr: { total_cents: ocrCost, document_count: ocrCount },
            form_fill: { total_cents: formFillCost, fill_count: formFillCount },
        };

        // Optional breakdown, keyed by the requested grouping ('total' adds none)
        const breakdowns = {
            mode: {
                key: 'by_mode',
                sql: `
                    SELECT datalab_mode as mode, COUNT(*) as count, COALESCE(SUM(cost_cents), 0) as total_cents
                    FROM ocr_results WHERE cost_cents > 0 GROUP BY datalab_mode
                `,
            },
            document: {
                key: 'by_document',
                sql: `
                    SELECT d.file_name, o.datalab_mode as mode, o.cost_cents, o.page_count
                    FROM ocr_results o JOIN documents d ON d.id = o.document_id
                    WHERE o.cost_cents > 0 ORDER BY o.cost_cents DESC LIMIT 50
                `,
            },
            month: {
                key: 'by_month',
                sql: `
                    SELECT strftime('%Y-%m', processing_completed_at) as month,
                        COUNT(*) as count, COALESCE(SUM(cost_cents), 0) as total_cents
                    FROM ocr_results WHERE cost_cents > 0
                    GROUP BY strftime('%Y-%m', processing_completed_at) ORDER BY month DESC
                `,
            },
        };
        const breakdown = Object.hasOwn(breakdowns, groupBy) ? breakdowns[groupBy] : undefined;
        if (breakdown !== undefined) {
            result[breakdown.key] = conn.prepare(breakdown.sql).all();
        }

        // Comparison processing durations (compute-only, no API cost)
        const comparisonRow = conn
            .prepare(`
                SELECT COUNT(*) as count,
                    COALESCE(SUM(processing_duration_ms), 0) as total_ms,
                    AVG(processing_duration_ms) as avg_ms
                FROM comparisons
            `)
            .get();
        result.comparison_compute = {
            total_comparisons: comparisonRow.count,
            total_duration_ms: comparisonRow.total_ms,
            avg_duration_ms: comparisonRow.avg_ms,
        };

        // Clustering processing durations (compute-only, no API cost)
        const clusterRow = conn
            .prepare(`
                SELECT COUNT(*) as count,
                    COUNT(DISTINCT run_id) as runs,
                    COALESCE(SUM(processing_duration_ms), 0) as total_ms,
                    AVG(processing_duration_ms) as avg_ms
                FROM clusters
            `)
            .get();
        result.clustering_compute = {
            total_clusters: clusterRow.count,
            total_runs: clusterRow.runs,
            total_duration_ms: clusterRow.total_ms,
            avg_duration_ms: clusterRow.avg_ms,
        };

        result.next_steps = [
            { tool: 'ocr_report_performance', description: 'Get pipeline performance analytics' },
            { tool: 'ocr_db_stats', description: 'Get database overview statistics' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
700
+ // ═══════════════════════════════════════════════════════════════════════════════
701
+ // CONSOLIDATED PERFORMANCE REPORT HANDLER
702
+ // ═══════════════════════════════════════════════════════════════════════════════
703
+ /**
704
+ * Handle ocr_report_performance - Consolidated pipeline + throughput + bottlenecks
705
+ * Merges former ocr_pipeline_analytics, ocr_throughput_analytics, and ocr_provenance_bottlenecks.
706
+ * section='pipeline' | 'throughput' | 'bottlenecks' | 'all' (default: 'all')
707
+ */
708
+ export async function handleReportPerformance(params) {
709
+ try {
710
+ const input = validateInput(ReportPerformanceInput, params);
711
+ const section = input.section ?? 'all';
712
+ const { db } = requireDatabase();
713
+ const conn = db.getConnection();
714
+ const result = { section };
715
+ // ---- Pipeline section (former ocr_pipeline_analytics) ----
716
+ if (section === 'pipeline' || section === 'all') {
717
+ const ocrStats = conn
718
+ .prepare(`
719
+ SELECT
720
+ COALESCE(COUNT(*), 0) as total_docs,
721
+ COALESCE(SUM(page_count), 0) as total_pages,
722
+ COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
723
+ COALESCE(MIN(processing_duration_ms), 0) as min_duration_ms,
724
+ COALESCE(MAX(processing_duration_ms), 0) as max_duration_ms,
725
+ COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms,
726
+ COALESCE(AVG(parse_quality_score), 0) as avg_quality
727
+ FROM ocr_results
728
+ `)
729
+ .get();
730
+ const avgMsPerPage = ocrStats.total_pages > 0 ? ocrStats.total_duration_ms / ocrStats.total_pages : 0;
731
+ const embeddingStats = conn
732
+ .prepare(`
733
+ SELECT
734
+ COALESCE(COUNT(*), 0) as total_embeddings,
735
+ COALESCE(AVG(generation_duration_ms), 0) as avg_duration_ms,
736
+ COALESCE(MIN(generation_duration_ms), 0) as min_duration_ms,
737
+ COALESCE(MAX(generation_duration_ms), 0) as max_duration_ms,
738
+ COALESCE(SUM(generation_duration_ms), 0) as total_duration_ms,
739
+ COUNT(DISTINCT gpu_device) as device_count
740
+ FROM embeddings
741
+ `)
742
+ .get();
743
+ const vlmStats = conn
744
+ .prepare(`
745
+ SELECT
746
+ COALESCE(COUNT(*), 0) as total_images,
747
+ COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN 1 ELSE 0 END), 0) as completed,
748
+ COALESCE(SUM(CASE WHEN vlm_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
749
+ COALESCE(AVG(CASE WHEN vlm_status = 'complete' THEN vlm_tokens_used END), 0) as avg_tokens,
750
+ COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN vlm_tokens_used ELSE 0 END), 0) as total_tokens,
751
+ COALESCE(AVG(CASE WHEN vlm_status = 'complete' THEN vlm_confidence END), 0) as avg_confidence
752
+ FROM images
753
+ `)
754
+ .get();
755
+ const compStats = conn
756
+ .prepare(`
757
+ SELECT
758
+ COALESCE(COUNT(*), 0) as total,
759
+ COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
760
+ COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
761
+ FROM comparisons
762
+ `)
763
+ .get();
764
+ const clusterStats = conn
765
+ .prepare(`
766
+ SELECT
767
+ COALESCE(COUNT(*), 0) as total_clusters,
768
+ COUNT(DISTINCT run_id) as total_runs,
769
+ COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
770
+ COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
771
+ FROM clusters
772
+ `)
773
+ .get();
774
+ const pagesPerMinute = ocrStats.total_duration_ms > 0
775
+ ? (ocrStats.total_pages / ocrStats.total_duration_ms) * 60000
776
+ : 0;
777
+ const embeddingsPerSecond = embeddingStats.total_duration_ms > 0
778
+ ? (embeddingStats.total_embeddings / embeddingStats.total_duration_ms) * 1000
779
+ : 0;
780
+ const pipelineData = {
781
+ ocr: {
782
+ total_docs: ocrStats.total_docs,
783
+ total_pages: ocrStats.total_pages,
784
+ avg_duration_ms: ocrStats.avg_duration_ms,
785
+ min_duration_ms: ocrStats.min_duration_ms,
786
+ max_duration_ms: ocrStats.max_duration_ms,
787
+ total_duration_ms: ocrStats.total_duration_ms,
788
+ avg_ms_per_page: avgMsPerPage,
789
+ avg_quality: ocrStats.avg_quality,
790
+ },
791
+ embeddings: {
792
+ total_embeddings: embeddingStats.total_embeddings,
793
+ avg_duration_ms: embeddingStats.avg_duration_ms,
794
+ min_duration_ms: embeddingStats.min_duration_ms,
795
+ max_duration_ms: embeddingStats.max_duration_ms,
796
+ total_duration_ms: embeddingStats.total_duration_ms,
797
+ device_count: embeddingStats.device_count,
798
+ },
799
+ vlm: {
800
+ total_images: vlmStats.total_images,
801
+ completed: vlmStats.completed,
802
+ failed: vlmStats.failed,
803
+ avg_tokens: vlmStats.avg_tokens,
804
+ total_tokens: vlmStats.total_tokens,
805
+ avg_confidence: vlmStats.avg_confidence,
806
+ },
807
+ comparisons: {
808
+ total: compStats.total,
809
+ avg_duration_ms: compStats.avg_duration_ms,
810
+ total_duration_ms: compStats.total_duration_ms,
811
+ },
812
+ clustering: {
813
+ total_clusters: clusterStats.total_clusters,
814
+ total_runs: clusterStats.total_runs,
815
+ avg_duration_ms: clusterStats.avg_duration_ms,
816
+ total_duration_ms: clusterStats.total_duration_ms,
817
+ },
818
+ throughput: {
819
+ pages_per_minute: pagesPerMinute,
820
+ embeddings_per_second: embeddingsPerSecond,
821
+ },
822
+ };
823
+ // Group-by breakdown
824
+ if (input.group_by === 'mode') {
825
+ pipelineData.by_mode = conn
826
+ .prepare(`
827
+ SELECT
828
+ datalab_mode as mode,
829
+ COUNT(*) as count,
830
+ COALESCE(AVG(processing_duration_ms), 0) as avg_ms,
831
+ COALESCE(AVG(parse_quality_score), 0) as avg_quality,
832
+ COALESCE(AVG(cost_cents), 0) as avg_cost
833
+ FROM ocr_results
834
+ GROUP BY datalab_mode
835
+ `)
836
+ .all();
837
+ }
838
+ else if (input.group_by === 'file_type') {
839
+ pipelineData.by_file_type = conn
840
+ .prepare(`
841
+ SELECT
842
+ d.file_type,
843
+ COUNT(*) as count,
844
+ COALESCE(AVG(o.processing_duration_ms), 0) as avg_ms,
845
+ COALESCE(AVG(o.parse_quality_score), 0) as avg_quality
846
+ FROM ocr_results o
847
+ JOIN documents d ON d.id = o.document_id
848
+ GROUP BY d.file_type
849
+ LIMIT ?
850
+ `)
851
+ .all(input.limit);
852
+ }
853
+ else if (input.group_by === 'document') {
854
+ pipelineData.by_document = conn
855
+ .prepare(`
856
+ SELECT
857
+ d.id as document_id,
858
+ d.file_name,
859
+ o.processing_duration_ms,
860
+ o.page_count,
861
+ o.parse_quality_score as quality,
862
+ o.datalab_mode as mode,
863
+ (SELECT COUNT(*) FROM chunks c WHERE c.document_id = d.id) as chunk_count,
864
+ (SELECT COUNT(*) FROM images i WHERE i.document_id = d.id) as image_count
865
+ FROM ocr_results o
866
+ JOIN documents d ON d.id = o.document_id
867
+ ORDER BY o.processing_duration_ms DESC
868
+ LIMIT ?
869
+ `)
870
+ .all(input.limit);
871
+ }
872
+ result.pipeline = pipelineData;
873
+ }
874
+ // ---- Throughput section (former ocr_throughput_analytics from timeline.ts) ----
875
+ if (section === 'throughput' || section === 'all') {
876
+ const bucket = input.bucket ?? 'daily';
877
+ const data = db.getThroughputAnalytics({
878
+ bucket,
879
+ created_after: input.created_after,
880
+ created_before: input.created_before,
881
+ });
882
+ const totalPages = data.reduce((sum, d) => sum + d.pages_processed, 0);
883
+ const totalEmbeddings = data.reduce((sum, d) => sum + d.embeddings_generated, 0);
884
+ const totalImages = data.reduce((sum, d) => sum + d.images_processed, 0);
885
+ const totalOcrMs = data.reduce((sum, d) => sum + d.total_ocr_duration_ms, 0);
886
+ const totalEmbMs = data.reduce((sum, d) => sum + d.total_embedding_duration_ms, 0);
887
+ result.throughput = {
888
+ bucket,
889
+ total_periods: data.length,
890
+ filters: {
891
+ created_after: input.created_after ?? null,
892
+ created_before: input.created_before ?? null,
893
+ },
894
+ summary: {
895
+ total_pages_processed: totalPages,
896
+ total_embeddings_generated: totalEmbeddings,
897
+ total_images_processed: totalImages,
898
+ total_ocr_duration_ms: totalOcrMs,
899
+ total_embedding_duration_ms: totalEmbMs,
900
+ overall_avg_ms_per_page: totalPages > 0
901
+ ? Math.round((totalOcrMs / totalPages) * 100) / 100
902
+ : 0,
903
+ overall_avg_ms_per_embedding: totalEmbeddings > 0
904
+ ? Math.round((totalEmbMs / totalEmbeddings) * 100) / 100
905
+ : 0,
906
+ },
907
+ data,
908
+ };
909
+ }
910
+ // ---- Bottlenecks section (former ocr_provenance_bottlenecks) ----
911
+ if (section === 'bottlenecks' || section === 'all') {
912
+ const byProcessor = conn
913
+ .prepare(`
914
+ SELECT
915
+ processor,
916
+ type,
917
+ COUNT(*) as count,
918
+ COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
919
+ COALESCE(MIN(processing_duration_ms), 0) as min_duration_ms,
920
+ COALESCE(MAX(processing_duration_ms), 0) as max_duration_ms,
921
+ COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
922
+ FROM provenance
923
+ WHERE processing_duration_ms IS NOT NULL AND processing_duration_ms > 0
924
+ GROUP BY processor, type
925
+ ORDER BY total_duration_ms DESC
926
+ `)
927
+ .all();
928
+ const byChainDepth = conn
929
+ .prepare(`
930
+ SELECT
931
+ chain_depth,
932
+ type,
933
+ COUNT(*) as count,
934
+ COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
935
+ COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
936
+ FROM provenance
937
+ WHERE processing_duration_ms IS NOT NULL AND processing_duration_ms > 0
938
+ GROUP BY chain_depth, type
939
+ ORDER BY chain_depth ASC, total_duration_ms DESC
940
+ `)
941
+ .all();
942
+ const slowestOps = conn
943
+ .prepare(`
944
+ SELECT
945
+ p.id as provenance_id,
946
+ p.type,
947
+ p.processor,
948
+ p.processing_duration_ms,
949
+ p.chain_depth,
950
+ p.source_path,
951
+ d.file_name as document_name
952
+ FROM provenance p
953
+ LEFT JOIN documents d ON d.provenance_id = p.root_document_id
954
+ WHERE p.processing_duration_ms IS NOT NULL AND p.processing_duration_ms > 0
955
+ ORDER BY p.processing_duration_ms DESC
956
+ LIMIT 10
957
+ `)
958
+ .all();
959
+ const grandTotal = byProcessor.reduce((sum, p) => sum + p.total_duration_ms, 0);
960
+ result.bottlenecks = {
961
+ grand_total_duration_ms: grandTotal,
962
+ by_processor: byProcessor.map((p) => ({
963
+ processor: p.processor,
964
+ type: p.type,
965
+ count: p.count,
966
+ avg_duration_ms: p.avg_duration_ms,
967
+ min_duration_ms: p.min_duration_ms,
968
+ max_duration_ms: p.max_duration_ms,
969
+ total_duration_ms: p.total_duration_ms,
970
+ pct_of_total: grandTotal > 0
971
+ ? Math.round((p.total_duration_ms / grandTotal) * 10000) / 100
972
+ : 0,
973
+ })),
974
+ by_chain_depth: byChainDepth.map((d) => ({
975
+ chain_depth: d.chain_depth,
976
+ type: d.type,
977
+ count: d.count,
978
+ avg_duration_ms: d.avg_duration_ms,
979
+ total_duration_ms: d.total_duration_ms,
980
+ })),
981
+ slowest_operations: slowestOps.map((o) => ({
982
+ provenance_id: o.provenance_id,
983
+ type: o.type,
984
+ processor: o.processor,
985
+ processing_duration_ms: o.processing_duration_ms,
986
+ chain_depth: o.chain_depth,
987
+ document_name: o.document_name,
988
+ source_path: o.source_path,
989
+ })),
990
+ };
991
+ }
992
+ result.next_steps = [
993
+ { tool: 'ocr_report_overview', description: 'Get quality and corpus overview' },
994
+ { tool: 'ocr_error_analytics', description: 'Analyze error patterns' },
995
+ ];
996
+ return formatResponse(successResult(result));
997
+ }
998
+ catch (error) {
999
+ return handleError(error);
1000
+ }
1001
+ }
1002
+ // ═══════════════════════════════════════════════════════════════════════════════
1003
+ // ERROR & RECOVERY ANALYTICS HANDLER
1004
+ // ═══════════════════════════════════════════════════════════════════════════════
1005
/**
 * Handle ocr_error_analytics - Get error and recovery analytics.
 *
 * Validates `params` against ErrorAnalyticsInput, then aggregates status counts
 * from the `documents`, `images` and `chunks` tables into a single payload with
 * one section per pipeline stage. Each stage section carries a
 * `failure_rate_pct` (0 when the table is empty). When
 * `input.include_error_messages` is truthy the most frequent error messages for
 * documents and failed VLM images are appended, capped at `input.limit` rows.
 *
 * @param {object} params - Raw tool arguments (validated via validateInput).
 * @returns {Promise<object>} The value of formatResponse(successResult(...)) on
 *   success; on any thrown error, the value of handleError(error).
 */
export async function handleErrorAnalytics(params) {
    try {
        const input = validateInput(ErrorAnalyticsInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Shared rate helper: guards the division so an empty table yields 0 instead of NaN.
        const failureRatePct = (failed, total) => (total > 0 ? (failed / total) * 100 : 0);
        // 1. Document failure rates
        const docFailures = conn
            .prepare(`
        SELECT
          COUNT(*) as total,
          COALESCE(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
          COALESCE(SUM(CASE WHEN status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
          COALESCE(SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END), 0) as pending,
          COALESCE(SUM(CASE WHEN status = 'processing' THEN 1 ELSE 0 END), 0) as processing
        FROM documents
      `)
            .get();
        // 2. Failure by file type
        const failureByFileType = conn
            .prepare(`
        SELECT
          file_type,
          COUNT(*) as total,
          COALESCE(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
          ROUND(
            CASE WHEN COUNT(*) > 0
              THEN CAST(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS REAL) / COUNT(*) * 100
              ELSE 0
            END,
          1) as failure_rate_pct
        FROM documents
        GROUP BY file_type
        ORDER BY failed DESC
      `)
            .all();
        // 3. VLM failure stats
        const vlmFailures = conn
            .prepare(`
        SELECT
          COUNT(*) as total_images,
          COALESCE(SUM(CASE WHEN vlm_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
          COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
          COALESCE(SUM(CASE WHEN vlm_status = 'pending' THEN 1 ELSE 0 END), 0) as pending
        FROM images
      `)
            .get();
        // 4. Embedding failure stats (from chunks embedding_status)
        const embeddingFailures = conn
            .prepare(`
        SELECT
          COUNT(*) as total_chunks,
          COALESCE(SUM(CASE WHEN embedding_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
          COALESCE(SUM(CASE WHEN embedding_status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
          COALESCE(SUM(CASE WHEN embedding_status = 'pending' THEN 1 ELSE 0 END), 0) as pending
        FROM chunks
      `)
            .get();
        const result = {
            documents: {
                total: docFailures.total,
                failed: docFailures.failed,
                complete: docFailures.complete,
                pending: docFailures.pending,
                processing: docFailures.processing,
                failure_rate_pct: failureRatePct(docFailures.failed, docFailures.total),
            },
            failure_by_file_type: failureByFileType,
            vlm: {
                total_images: vlmFailures.total_images,
                failed: vlmFailures.failed,
                complete: vlmFailures.complete,
                pending: vlmFailures.pending,
                failure_rate_pct: failureRatePct(vlmFailures.failed, vlmFailures.total_images),
            },
            embeddings: {
                total_chunks: embeddingFailures.total_chunks,
                failed: embeddingFailures.failed,
                complete: embeddingFailures.complete,
                pending: embeddingFailures.pending,
                // Reported for parity with the documents and vlm sections.
                failure_rate_pct: failureRatePct(embeddingFailures.failed, embeddingFailures.total_chunks),
            },
        };
        if (input.include_error_messages) {
            // 5. Common document errors (optional)
            result.common_document_errors = conn
                .prepare(`
          SELECT
            error_message,
            COUNT(*) as count
          FROM documents
          WHERE error_message IS NOT NULL
          GROUP BY error_message
          ORDER BY count DESC
          LIMIT ?
        `)
                .all(input.limit);
            // 6. Common VLM errors (optional)
            result.common_vlm_errors = conn
                .prepare(`
          SELECT
            error_message,
            COUNT(*) as count
          FROM images
          WHERE vlm_status = 'failed' AND error_message IS NOT NULL
          GROUP BY error_message
          ORDER BY count DESC
          LIMIT ?
        `)
                .all(input.limit);
        }
        result.next_steps = [
            { tool: 'ocr_retry_failed', description: 'Retry failed documents' },
            { tool: 'ocr_image_reset_failed', description: 'Reset failed VLM images' },
            { tool: 'ocr_health_check', description: 'Run a full health check' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
1132
+ // ═══════════════════════════════════════════════════════════════════════════════
1133
+ // UNIFIED TRENDS HANDLER (MERGE-C)
1134
+ // ═══════════════════════════════════════════════════════════════════════════════
1135
/**
 * Handle ocr_trends - unified time-series trends.
 *
 * The validated `metric` selects the data source:
 *   - 'quality': calls db.getQualityTrends with the bucket, group_by and date window.
 *   - anything else (i.e. 'volume'): calls db.getTimelineStats with the bucket,
 *     volume metric and date window, and additionally sums the `count` of every
 *     returned period into `total_count`.
 *
 * Defaults applied here: bucket 'daily', group_by 'none', volume_metric 'documents'.
 * Any thrown error is routed through handleError.
 */
async function handleTrends(params) {
    try {
        const input = validateInput(TrendsInput, params);
        const { db } = requireDatabase();
        const bucket = input.bucket ?? 'daily';
        // Date window forwarded to the database layer as-is (undefined when not supplied).
        const window = {
            created_after: input.created_after,
            created_before: input.created_before,
        };
        // Same window echoed back to the caller, with missing bounds normalised to null.
        const filters = {
            created_after: input.created_after ?? null,
            created_before: input.created_before ?? null,
        };
        let payload;
        if (input.metric !== 'quality') {
            // metric === 'volume'
            const volumeMetric = input.volume_metric ?? 'documents';
            const series = db.getTimelineStats({ bucket, metric: volumeMetric, ...window });
            payload = {
                metric: 'volume',
                bucket,
                volume_metric: volumeMetric,
                total_periods: series.length,
                total_count: series.reduce((running, period) => running + period.count, 0),
                filters,
                data: series,
                next_steps: [
                    { tool: 'ocr_report_performance', description: 'Get detailed pipeline performance' },
                    { tool: 'ocr_trends', description: 'View quality trends (metric=quality)' },
                ],
            };
        }
        else {
            const groupBy = input.group_by ?? 'none';
            const series = db.getQualityTrends({ bucket, group_by: groupBy, ...window });
            payload = {
                metric: 'quality',
                bucket,
                group_by: groupBy,
                total_periods: series.length,
                filters,
                data: series,
                next_steps: [
                    { tool: 'ocr_report_overview', description: 'Get aggregate quality summary' },
                    { tool: 'ocr_trends', description: 'View volume trends (metric=volume)' },
                ],
            };
        }
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
1198
/**
 * Render the VLM evaluation report as a Markdown string.
 *
 * Sections, in order: executive summary, image type distribution, per-document
 * summary (top 20 documents by image count), low-confidence images (first 30,
 * only when any exist), processing statistics and a 40-character VLM progress bar.
 *
 * Hardening compared with a naive template:
 *   - table cells are escaped so `|` or newlines in file names, types or paths
 *     cannot break the Markdown table layout;
 *   - missing / non-numeric confidence values render as `N/A` instead of a
 *     misleading `0.0%` or `NaN%`;
 *   - the progress bar ratio is clamped to [0, 1] so inconsistent counters
 *     (processed > total) cannot make String.prototype.repeat throw a RangeError;
 *   - image paths are shortened on both `/` and `\` separators, and a missing
 *     file name or path no longer throws.
 *
 * @param {object} params
 * @param {object} params.dbStats - Reads total_documents, total_chunks,
 *   total_embeddings, total_extractions, total_form_fills.
 * @param {object} params.imageStats - Reads total, processed, pending, failed.
 * @param {Array<object>} params.docStats - Per-document rows; reads file_name,
 *   page_count, image_count, vlm_complete, avg_confidence, min_confidence.
 * @param {Array<object>} params.lowConfidenceImages - Reads file_name, page,
 *   confidence, image_type, path.
 * @param {Object<string, number>} params.imageTypeDistribution - Count per image type.
 * @param {number} params.overallAvgConfidence - Multiplied by 100 for display.
 * @param {number} params.confidenceThreshold - Multiplied by 100 for display.
 * @param {object} params.comparisonStats - Reads total.
 * @param {object} params.clusteringStats - Reads total_clusters, total_runs, avg_coherence.
 * @returns {string} The complete Markdown report.
 */
function generateMarkdownReport(params) {
    const now = new Date().toISOString();
    const { dbStats, imageStats, docStats, lowConfidenceImages, imageTypeDistribution, overallAvgConfidence, confidenceThreshold, comparisonStats, clusteringStats, } = params;
    // Make a value safe to place inside a Markdown table cell.
    const escapeCell = (value) => String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');
    // Shorten long names to at most `max` characters, ending in an ellipsis.
    const truncate = (text, max) => (text.length > max ? text.slice(0, max - 3) + '...' : text);
    // Format a 0..1 fraction as a percentage; null/undefined/non-numeric values become N/A.
    const formatPct = (value, digits = 1) => {
        const num = value == null ? NaN : Number(value);
        return Number.isFinite(num) ? `${(num * 100).toFixed(digits)}%` : 'N/A';
    };
    const thresholdLabel = `${(confidenceThreshold * 100).toFixed(0)}%`;
    let report = `# Gemini VLM Evaluation Report

Generated: ${now}

## Executive Summary

| Metric | Value |
|--------|-------|
| Total Documents | ${dbStats.total_documents} |
| Total Pages | ${docStats.reduce((sum, d) => sum + (d.page_count || 0), 0)} |
| Total Images Extracted | ${imageStats.total} |
| VLM Processed | ${imageStats.processed} |
| VLM Pending | ${imageStats.pending} |
| VLM Failed | ${imageStats.failed} |
| **Overall Avg Confidence** | **${formatPct(overallAvgConfidence)}** |
| Low Confidence (< ${thresholdLabel}) | ${lowConfidenceImages.length} |

---

## Image Type Distribution

| Type | Count | Percentage |
|------|-------|------------|
`;
    const totalImages = Object.values(imageTypeDistribution).reduce((a, b) => a + b, 0);
    const sortedTypes = Object.entries(imageTypeDistribution).sort(([, a], [, b]) => b - a);
    for (const [type, count] of sortedTypes) {
        const pct = totalImages > 0 ? ((count / totalImages) * 100).toFixed(1) : '0.0';
        report += `| ${escapeCell(type)} | ${count} | ${pct}% |\n`;
    }
    report += `
---

## Per-Document Summary

| Document | Pages | Images | Complete | Avg Conf | Min Conf |
|----------|-------|--------|----------|----------|----------|
`;
    // Sort by number of images descending (on a copy; the caller's array is left untouched)
    const sortedDocs = [...docStats].sort((a, b) => b.image_count - a.image_count);
    for (const doc of sortedDocs.slice(0, 20)) {
        // Top 20 documents
        const fileName = escapeCell(truncate(String(doc.file_name ?? 'N/A'), 40));
        report += `| ${fileName} | ${doc.page_count || 'N/A'} | ${doc.image_count} | ${doc.vlm_complete} | ${formatPct(doc.avg_confidence)} | ${formatPct(doc.min_confidence)} |\n`;
    }
    if (sortedDocs.length > 20) {
        report += `| ... and ${sortedDocs.length - 20} more | | | | | |\n`;
    }
    if (lowConfidenceImages.length > 0) {
        report += `
---

## Low Confidence Images (< ${thresholdLabel})

These images may need manual review or reprocessing.

| Document | Page | Confidence | Type | Path |
|----------|------|------------|------|------|
`;
        for (const img of lowConfidenceImages.slice(0, 30)) {
            const fileName = escapeCell(truncate(String(img.file_name ?? 'N/A'), 30));
            // Keep only the last two path segments; accept both POSIX and Windows separators.
            const shortPath = escapeCell(String(img.path ?? '').split(/[\\/]/).slice(-2).join('/'));
            report += `| ${fileName} | ${img.page} | ${formatPct(img.confidence)} | ${escapeCell(img.image_type)} | ${shortPath} |\n`;
        }
        if (lowConfidenceImages.length > 30) {
            report += `| ... and ${lowConfidenceImages.length - 30} more | | | | |\n`;
        }
    }
    const coherenceNote = clusteringStats.avg_coherence != null
        ? ` (avg coherence: ${(clusteringStats.avg_coherence * 100).toFixed(1)}%)`
        : '';
    let progressLine = 'No images to process.';
    if (imageStats.total > 0) {
        const ratio = imageStats.processed / imageStats.total;
        // Clamp only the bar width; the printed percentage still shows the raw ratio.
        const filled = Math.round(Math.min(1, Math.max(0, ratio)) * 40);
        progressLine = `Processed: ${'█'.repeat(filled)}${'░'.repeat(40 - filled)} ${(ratio * 100).toFixed(1)}%`;
    }
    report += `
---

## Processing Statistics

- **OCR Results**: ${dbStats.total_documents} documents processed
- **Text Chunks**: ${dbStats.total_chunks} chunks created
- **Text Embeddings**: ${dbStats.total_embeddings} embeddings stored
- **Structured Extractions**: ${dbStats.total_extractions} extractions
- **Form Fills**: ${dbStats.total_form_fills} form fills
- **Comparisons**: ${comparisonStats.total} document comparisons
- **Clusters**: ${clusteringStats.total_clusters} clusters across ${clusteringStats.total_runs} runs${coherenceNote}

### VLM Processing Rate

\`\`\`
${progressLine}
\`\`\`

---

*Report generated by OCR Provenance MCP System*
`;
    return report;
}
1294
+ // ═══════════════════════════════════════════════════════════════════════════════
1295
+ // TOOL DEFINITIONS FOR MCP REGISTRATION
1296
+ // ═══════════════════════════════════════════════════════════════════════════════
1297
/**
 * Report tools collection for MCP server registration.
 *
 * Maps each tool name to its description, its input schema (a map of field name
 * to zod schema) and the handler function that serves it.
 */
export const reportTools = {
    // Evaluation report written as markdown.
    ocr_evaluation_report: {
        description: '[STATUS] Use to generate a comprehensive evaluation report with OCR and VLM metrics. Saves as markdown file. Returns report path and summary.',
        inputSchema: {
            output_path: z.string().optional().describe('Path to save markdown report (optional)'),
            confidence_threshold: z
                .number()
                .min(0)
                .max(1)
                .default(0.7)
                .describe('Threshold for low confidence flagging'),
        },
        handler: handleEvaluationReport,
    },
    // Single-document report.
    ocr_document_report: {
        description: '[STATUS] Use to get a detailed report for a single document (images, extractions, comparisons, clusters). Returns comprehensive document analysis.',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID'),
        },
        handler: handleDocumentReport,
    },
    // Quality / corpus overview.
    ocr_report_overview: {
        description: '[STATUS] Quality and corpus overview. section="quality"|"corpus"|"all" (default). Aggregate scores, content type stats.',
        inputSchema: {
            section: z
                .enum(['quality', 'corpus', 'all'])
                .default('all')
                .describe('Which section to return: quality, corpus, or all'),
            include_section_frequency: z
                .boolean()
                .default(true)
                .describe('(corpus) Include most common section headings across documents'),
            include_content_type_distribution: z
                .boolean()
                .default(true)
                .describe('(corpus) Include content type distribution (tables, code, etc.)'),
            limit: z.number().int().min(1).max(100).default(20).describe('(corpus) Max items per list'),
        },
        handler: handleReportOverview,
    },
    // Cost analytics.
    ocr_cost_summary: {
        description: '[STATUS] Use to get cost analytics for OCR and form fill operations. Returns costs grouped by document, mode, month, or total.',
        inputSchema: {
            group_by: z
                .enum(['document', 'mode', 'month', 'total'])
                .default('total')
                .describe('How to group cost data'),
        },
        handler: handleCostSummary,
    },
    // Pipeline, throughput and bottleneck analytics.
    ocr_report_performance: {
        description: '[STATUS] Pipeline performance analytics. section="pipeline"|"throughput"|"bottlenecks"|"all" (default).',
        inputSchema: {
            section: z
                .enum(['pipeline', 'throughput', 'bottlenecks', 'all'])
                .default('all')
                .describe('Which section to return'),
            group_by: z
                .enum(['total', 'document', 'mode', 'file_type'])
                .default('total')
                .describe('(pipeline) How to group performance data'),
            limit: z.number().int().min(1).max(100).default(20).describe('(pipeline) Max items per group'),
            bucket: z
                .enum(['hourly', 'daily', 'weekly', 'monthly'])
                .default('daily')
                .describe('(throughput) Time bucket granularity'),
            created_after: z
                .string()
                .optional()
                .describe('(throughput) Filter data created after this ISO 8601 timestamp'),
            created_before: z
                .string()
                .optional()
                .describe('(throughput) Filter data created before this ISO 8601 timestamp'),
        },
        handler: handleReportPerformance,
    },
    // Error and recovery analytics.
    ocr_error_analytics: {
        description: '[STATUS] Use to get error and recovery analytics (failure rates, common error messages). Returns error breakdown for documents, VLM, and embeddings.',
        inputSchema: {
            include_error_messages: z
                .boolean()
                .default(true)
                .describe('Include most common error messages'),
            // Previously the only field without a description; the handler uses it
            // as the SQL LIMIT of each common-error-message list.
            limit: z
                .number()
                .int()
                .min(1)
                .max(50)
                .default(10)
                .describe('Max distinct error messages returned per list'),
        },
        handler: handleErrorAnalytics,
    },
    // Quality / volume time series; schema is reused from TrendsInput.
    ocr_trends: {
        description: '[STATUS] Time-series trends. metric="quality" for OCR scores, "volume" for processing counts. Bucketed by time period.',
        inputSchema: TrendsInput.shape,
        handler: handleTrends,
    },
};
1394
+ //# sourceMappingURL=reports.js.map