ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,104 @@
1
+ """
2
+ OCR Provenance MCP System - Python Workers
3
+
4
+ This package provides:
5
+ - GPU utilities for device detection and VRAM monitoring
6
+ - Datalab OCR worker for document processing
7
+ - Embedding worker for local inference with nomic-embed-text-v1.5
8
+
9
+ CRITICAL DESIGN PRINCIPLES:
10
+ - CP-004: Local Inference - Embedding generation MUST run locally
11
+ - No data leaves the local machine for embedding generation
12
+ - Auto-detects best device: CUDA > MPS (Apple Silicon) > CPU
13
+
14
+ Supported Platforms:
15
+ - Linux/Windows with NVIDIA GPU (CUDA)
16
+ - macOS with Apple Silicon (MPS)
17
+ - Any platform without GPU (CPU fallback)
18
+
19
+ Module Structure:
20
+ - gpu_utils: GPU verification, VRAM monitoring, device detection
21
+ - ocr_worker: Datalab OCR API integration (future)
22
+ - embedding_worker: nomic-embed-text-v1.5 inference
23
+ """
24
+
25
+ __version__ = "1.0.0"
26
+ __author__ = "OCR Provenance MCP System"
27
+
28
+ from .embedding_worker import (
29
+ DEFAULT_BATCH_SIZE,
30
+ DEFAULT_DEVICE,
31
+ EMBEDDING_DIM,
32
+ MODEL_NAME,
33
+ # Constants
34
+ MODEL_PATH,
35
+ MODEL_VERSION,
36
+ PREFIX_DOCUMENT,
37
+ PREFIX_QUERY,
38
+ # Data classes
39
+ EmbeddingResult,
40
+ QueryEmbeddingResult,
41
+ embed_chunks,
42
+ embed_query,
43
+ embed_with_oom_recovery,
44
+ generate_embeddings,
45
+ generate_query_embedding,
46
+ # Core functions
47
+ load_model,
48
+ )
49
+ from .gpu_utils import (
50
+ EmbeddingModelError,
51
+ # Error classes
52
+ GPUError,
53
+ # Type definitions
54
+ GPUInfo,
55
+ GPUNotAvailableError,
56
+ GPUOutOfMemoryError,
57
+ ModelInfo,
58
+ VRAMUsage,
59
+ clear_gpu_memory,
60
+ get_vram_usage,
61
+ test_embedding_generation,
62
+ # Core functions
63
+ verify_gpu,
64
+ verify_model_loading,
65
+ )
66
+
67
+ __all__ = [
68
+ "DEFAULT_BATCH_SIZE",
69
+ "DEFAULT_DEVICE",
70
+ "EMBEDDING_DIM",
71
+ "MODEL_NAME",
72
+ # Constants (from embedding_worker)
73
+ "MODEL_PATH",
74
+ "MODEL_VERSION",
75
+ "PREFIX_DOCUMENT",
76
+ "PREFIX_QUERY",
77
+ # Error classes (from gpu_utils)
78
+ "EmbeddingModelError",
79
+ # Data classes (from embedding_worker)
80
+ "EmbeddingResult",
81
+ "GPUError",
82
+ # Type definitions (from gpu_utils)
83
+ "GPUInfo",
84
+ "GPUNotAvailableError",
85
+ "GPUOutOfMemoryError",
86
+ "ModelInfo",
87
+ "QueryEmbeddingResult",
88
+ "VRAMUsage",
89
+ # Version
90
+ "__version__",
91
+ # GPU utilities (from gpu_utils)
92
+ "clear_gpu_memory",
93
+ "embed_chunks",
94
+ "embed_query",
95
+ "embed_with_oom_recovery",
96
+ "generate_embeddings",
97
+ "generate_query_embedding",
98
+ "get_vram_usage",
99
+ # Embedding functions (from embedding_worker)
100
+ "load_model",
101
+ "test_embedding_generation",
102
+ "verify_gpu",
103
+ "verify_model_loading",
104
+ ]
@@ -0,0 +1,440 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Document Clustering Worker for OCR Provenance MCP System
4
+
5
+ Clusters documents by their embedding vectors using HDBSCAN, Agglomerative,
6
+ or K-Means algorithms. Reads JSON from stdin, writes JSON to stdout.
7
+
8
+ CRITICAL CONSTRAINTS:
9
+ - NEVER use print() except for the final JSON output to stdout
10
+ - Use sys.stderr.write() for any debug logging
11
+ - All numpy types MUST be converted to Python types before JSON output
12
+
13
+ Dependencies: scikit-learn >= 1.3 (includes HDBSCAN), numpy
14
+
15
+ Usage:
16
+ echo '{"embeddings": [...], "document_ids": [...], "algorithm": "hdbscan"}' | python clustering_worker.py
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import sys
23
+ import time
24
+
25
+ import numpy as np
26
+
27
+
28
def validate_inputs(data: dict) -> tuple[np.ndarray, list[str], str, dict, np.ndarray | None]:
    """
    Check the parsed request and pull out everything the clustering step needs.

    Args:
        data: Parsed input JSON. Reads the keys 'embeddings' (required),
            'document_ids', 'algorithm', 'n_clusters', 'min_cluster_size',
            'distance_threshold', 'linkage' and 'distance_matrix'.

    Returns:
        Tuple of (embeddings, document_ids, algorithm, params, distance_matrix).
        embeddings is a float32 array of shape (N, D); distance_matrix is a
        float64 (N, N) array, or None when the request did not include one.

    Raises:
        ValueError: If embeddings are missing, not 2-D, or hold fewer than two
            rows; if document_ids has the wrong length; if the algorithm is
            unknown; or if distance_matrix is not (N, N).
    """
    if "embeddings" not in data:
        raise ValueError("Missing required field: 'embeddings'")

    vectors = np.array(data["embeddings"], dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(f"Embeddings must be 2-dimensional (N, D), got shape {vectors.shape}")

    doc_count = vectors.shape[0]
    if doc_count < 2:
        raise ValueError(f"At least 2 documents required for clustering, got {doc_count}")

    # An absent or empty id list is accepted; only a non-empty list is length-checked.
    ids = data.get("document_ids", [])
    if ids and len(ids) != doc_count:
        raise ValueError(
            f"document_ids length ({len(ids)}) does not match embeddings count ({doc_count})"
        )

    supported = ("hdbscan", "agglomerative", "kmeans")
    chosen = data.get("algorithm", "hdbscan")
    if chosen not in supported:
        raise ValueError(f"Unknown algorithm '{chosen}'. Must be one of: {supported}")

    # Tunables with their fallbacks; values are passed through unvalidated.
    fallbacks = (
        ("n_clusters", None),
        ("min_cluster_size", 3),
        ("distance_threshold", 1.0),
        ("linkage", "average"),
    )
    options = {key: data.get(key, fallback) for key, fallback in fallbacks}

    precomputed: np.ndarray | None = None
    if "distance_matrix" in data:
        precomputed = np.array(data["distance_matrix"], dtype=np.float64)
        if precomputed.shape != (doc_count, doc_count):
            raise ValueError(
                f"distance_matrix shape {precomputed.shape} does not match "
                f"document count ({doc_count}, {doc_count})"
            )

    return vectors, ids, chosen, options, precomputed
84
+
85
+
86
def cluster_hdbscan(
    embeddings: np.ndarray,
    min_cluster_size: int,
    distance_matrix: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Run HDBSCAN over a pairwise distance matrix.

    Args:
        embeddings: (N, D) float32 array; only used to derive cosine
            distances when no distance_matrix is supplied
        min_cluster_size: Minimum points to form a cluster
        distance_matrix: Optional precomputed distance matrix (N, N)

    Returns:
        Tuple of (labels, probabilities) as produced by the fitted clusterer
    """
    from sklearn.cluster import HDBSCAN
    from sklearn.metrics.pairwise import cosine_distances

    if distance_matrix is None:
        pairwise = cosine_distances(embeddings)
    else:
        pairwise = distance_matrix

    model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="precomputed",
        cluster_selection_method="eom",
        allow_single_cluster=True,
    )

    # Fit on a copy: the original author notes sklearn may mutate the matrix
    # it is given, and a caller-supplied matrix must stay intact.
    assignments = model.fit_predict(pairwise.copy())

    return assignments, model.probabilities_
119
+
120
+
121
def cluster_agglomerative(
    embeddings: np.ndarray,
    n_clusters: int | None,
    distance_threshold: float,
    linkage: str,
    distance_matrix: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Run Agglomerative Clustering on cosine or precomputed distances.

    Args:
        embeddings: (N, D) float32 array, fitted directly with the cosine
            metric when no distance_matrix is given
        n_clusters: Number of clusters; None switches to distance_threshold
        distance_threshold: Max linkage distance (only used when n_clusters is None)
        linkage: Linkage criterion ('average', 'complete', 'single')
        distance_matrix: Optional precomputed distance matrix (N, N)

    Returns:
        Tuple of (labels, probabilities); probabilities are all 1.0 because
        this algorithm has no membership strengths of its own.

    Raises:
        ValueError: If ward linkage is requested (incompatible with cosine/precomputed)
    """
    from sklearn.cluster import AgglomerativeClustering

    # Ward only works with Euclidean distances, so reject it up front.
    if linkage == "ward":
        raise ValueError(
            "Ward linkage is incompatible with cosine distance. "
            "Use 'average', 'complete', or 'single' instead."
        )

    if distance_matrix is not None:
        shared = {"metric": "precomputed", "linkage": linkage}
        samples = distance_matrix
    else:
        shared = {"metric": "cosine", "linkage": linkage}
        samples = embeddings

    if n_clusters is None:
        # No fixed cluster count: cut the tree at the distance threshold instead.
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            **shared,
        )
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters, **shared)

    assignments = model.fit_predict(samples)
    confidences = np.ones(len(assignments), dtype=np.float64)

    return assignments, confidences
175
+
176
+
177
def cluster_kmeans(
    embeddings: np.ndarray,
    n_clusters: int | None,
    distance_matrix: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Cluster using K-Means.

    K-Means operates on feature vectors, so a precomputed distance_matrix
    cannot be fed to it directly. When one is provided, it is first projected
    into feature space with multidimensional scaling (MDS), and K-Means is run
    on the resulting coordinates. Otherwise K-Means runs on the embeddings.

    Both paths use a fixed random seed so that repeated runs on the same
    input give the same labels.

    Args:
        embeddings: (N, D) float32 array
        n_clusters: Number of clusters (defaults to sqrt(N), at least 2, if None)
        distance_matrix: Optional precomputed distance matrix (N, N)

    Returns:
        Tuple of (labels, probabilities); probabilities are all 1.0 because
        K-Means assigns hard labels.
    """
    from sklearn.cluster import KMeans

    n_samples = len(embeddings)

    if n_clusters is None:
        # Reasonable default: sqrt(N) capped at N-1, but never fewer than 2
        n_clusters = max(2, min(int(np.sqrt(n_samples)), n_samples - 1))

    if distance_matrix is not None:
        # K-Means needs feature vectors; convert distance matrix via MDS
        from sklearn.manifold import MDS

        mds = MDS(
            n_components=min(n_clusters, n_samples - 1),
            dissimilarity="precomputed",
            random_state=42,
            normalized_stress=False,
        )
        feature_vectors = mds.fit_transform(distance_matrix)
    else:
        feature_vectors = embeddings

    # Same seed on both paths: the embeddings path used to be unseeded, which
    # made its labels vary between runs on identical input.
    clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=42)
    labels = clusterer.fit_predict(feature_vectors)

    # K-Means does not produce probabilities
    probabilities = np.ones(len(labels), dtype=np.float64)

    return labels, probabilities
225
+
226
+
227
def compute_centroids(embeddings: np.ndarray, labels: np.ndarray) -> list[list[float]]:
    """
    Build one unit-length mean vector per cluster, ignoring noise (label -1).

    Args:
        embeddings: (N, D) float32 array
        labels: Cluster labels (N,)

    Returns:
        List of centroid vectors as plain Python lists, in ascending order of
        cluster label. A centroid whose norm is zero is returned unscaled.
    """
    cluster_ids = sorted(set(labels.tolist()) - {-1})
    result = []

    for cluster_id in cluster_ids:
        mean_vec = embeddings[labels == cluster_id].mean(axis=0)
        length = np.linalg.norm(mean_vec)
        # Skip the division for an all-zero mean to avoid producing NaNs.
        if length > 0:
            mean_vec = mean_vec / length
        result.append(mean_vec.tolist())

    return result
254
+
255
+
256
def compute_coherence_scores(embeddings: np.ndarray, labels: np.ndarray) -> list[float]:
    """
    Score each cluster by the mean cosine similarity over its member pairs.

    Args:
        embeddings: (N, D) float32 array
        labels: Cluster labels (N,)

    Returns:
        List of coherence scores rounded to 6 decimals, in ascending order of
        cluster label; noise (label -1) is skipped and a single-member
        cluster scores 1.0.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    result = []

    for cluster_id in sorted(set(labels.tolist()) - {-1}):
        members = embeddings[labels == cluster_id]
        size = len(members)

        if size < 2:
            # A lone member has no pairs to compare; treat as fully coherent.
            result.append(1.0)
            continue

        similarities = cosine_similarity(members)
        # The matrix is symmetric, so dropping the diagonal and halving the
        # remainder gives the sum over distinct pairs.
        pair_total = (similarities.sum() - np.trace(similarities)) / 2.0
        pair_count = size * (size - 1) / 2.0  # >= 1 because size >= 2
        result.append(round(float(pair_total / pair_count), 6))

    return result
292
+
293
+
294
def compute_silhouette(embeddings: np.ndarray, labels: np.ndarray) -> float:
    """
    Cosine silhouette score over the non-noise points (labels >= 0).

    Args:
        embeddings: (N, D) array of document vectors
        labels: Cluster labels (N,)

    Returns:
        The score rounded to 6 decimals, or 0.0 when fewer than two clusters
        or fewer than two points remain after dropping noise.
    """
    from sklearn.metrics import silhouette_score

    keep = labels >= 0
    kept_vectors = embeddings[keep]
    kept_labels = labels[keep]

    # The score is undefined without at least two clusters and two samples.
    cluster_total = len(set(kept_labels.tolist()))
    if cluster_total < 2 or len(kept_vectors) < 2:
        return 0.0

    return round(float(silhouette_score(kept_vectors, kept_labels, metric="cosine")), 6)
314
+
315
+
316
def run_clustering(data: dict) -> dict:
    """
    Validate the request, run the chosen algorithm, and assemble the response.

    Args:
        data: Parsed input JSON

    Returns:
        Result dict ready for JSON serialization: labels, probabilities,
        centroids, cluster/noise counts, quality scores and elapsed time.

    Raises:
        ValueError: Propagated from input validation or the clustering step.
    """
    started = time.perf_counter()

    embeddings, _document_ids, algorithm, params, distance_matrix = validate_inputs(data)

    # One zero-argument runner per algorithm; validation has already
    # restricted `algorithm` to these three names.
    runners = {
        "hdbscan": lambda: cluster_hdbscan(
            embeddings, params["min_cluster_size"], distance_matrix
        ),
        "agglomerative": lambda: cluster_agglomerative(
            embeddings,
            params["n_clusters"],
            params["distance_threshold"],
            params["linkage"],
            distance_matrix,
        ),
        "kmeans": lambda: cluster_kmeans(embeddings, params["n_clusters"], distance_matrix),
    }
    labels, probabilities = runners[algorithm]()

    # Convert everything numpy-typed to plain Python before serialization.
    labels_list = labels.tolist()
    is_noise = labels == -1
    noise_indices = np.flatnonzero(is_noise).tolist()
    noise_count = int(np.count_nonzero(is_noise))

    # Real clusters only: the noise label -1 does not count.
    n_clusters = len(set(labels_list) - {-1})

    centroids = compute_centroids(embeddings, labels)
    coherence_scores = compute_coherence_scores(embeddings, labels)
    silhouette = compute_silhouette(embeddings, labels)

    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

    return {
        "success": True,
        "labels": labels_list,
        "probabilities": [round(float(p), 6) for p in probabilities],
        "centroids": centroids,
        "n_clusters": n_clusters,
        "noise_count": noise_count,
        "noise_indices": noise_indices,
        "silhouette_score": silhouette,
        "coherence_scores": coherence_scores,
        "elapsed_ms": elapsed_ms,
    }
376
+
377
+
378
def main() -> None:
    """
    Entry point: read one JSON request from stdin, write one JSON reply to stdout.

    Exits with status 0 after printing a successful result. On any failure it
    prints a JSON object with 'success': False, an 'error' message and an
    'error_type', then exits with status 1.
    """

    def _fail(message: str, error_type: str) -> None:
        # Failures go to stdout as JSON too, so the caller always gets a parseable reply.
        print(json.dumps({"success": False, "error": message, "error_type": error_type}))
        sys.exit(1)

    try:
        payload = sys.stdin.read()
        if not payload.strip():
            raise ValueError("Empty input on stdin")

        outcome = run_clustering(json.loads(payload))
        print(json.dumps(outcome))
        sys.exit(0)

    # JSONDecodeError subclasses ValueError, so it has to be handled first.
    except json.JSONDecodeError as exc:
        _fail(f"Invalid JSON input: {exc}", "JSONDecodeError")

    except ValueError as exc:
        _fail(str(exc), "ValueError")

    except ImportError as exc:
        _fail(
            f"Missing dependency: {exc}. Requires scikit-learn >= 1.3 and numpy.",
            "ImportError",
        )

    except Exception as exc:
        _fail(str(exc), type(exc).__name__)
437
+
438
+
439
+ if __name__ == "__main__":
440
+ main()