ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,564 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Datalab File Manager Worker for OCR Provenance MCP System
4
+
5
+ Manages file uploads, listing, retrieval, and deletion via Datalab API.
6
+ FAIL-FAST: No fallbacks, no mocks. Errors propagate immediately.
7
+ """
8
+
9
+ import argparse
10
+ import hashlib
11
+ import json
12
+ import logging
13
+ import os
14
+ import sys
15
+ import time
16
+ from dataclasses import asdict, dataclass
17
+ from pathlib import Path
18
+
19
# Configure logging FIRST - all logging goes to stderr.
# stdout is used by main() for the single JSON document it prints, so log
# records must never be written there.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    stream=sys.stderr,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
26
+
27
+
28
+ # =============================================================================
29
+ # CONSTANTS
30
+ # =============================================================================
31
+
32
+ # SDK handles base URL via DATALAB_HOST env var (default: https://www.datalab.to)
33
+
34
+
35
+ # =============================================================================
36
+ # ERROR CLASSES (same pattern as form_fill_worker.py)
37
+ # =============================================================================
38
+
39
+
40
class FileManagerError(Exception):
    """Root exception for this worker.

    Every error carries a machine-readable ``category`` string in addition
    to the human-readable message; the CLI copies it into its JSON error
    output.
    """

    def __init__(self, message: str, category: str):
        # Store the category first, then let Exception record the message.
        self.category = category
        super().__init__(message)
46
+
47
+
48
class FileManagerAPIError(FileManagerError):
    """Error reported by (or attributed to) the remote API.

    The category is derived from the HTTP-style status code: 5xx and above
    map to ``FILE_MANAGER_SERVER_ERROR``, everything else to
    ``FILE_MANAGER_API_ERROR``.
    """

    def __init__(self, message: str, status_code: int):
        if status_code >= 500:
            category = "FILE_MANAGER_SERVER_ERROR"
        else:
            category = "FILE_MANAGER_API_ERROR"
        super().__init__(message, category)
        # Kept so the CLI can report it under "details".
        self.status_code = status_code
55
+
56
+
57
class FileManagerFileError(FileManagerError):
    """Local file problem (missing path, not a regular file, SDK file error).

    Always categorised as ``FILE_MANAGER_FILE_ERROR``; the offending path is
    kept on ``file_path`` so the CLI can report it under "details".
    """

    def __init__(self, message: str, file_path: str):
        self.file_path = file_path
        super().__init__(message, "FILE_MANAGER_FILE_ERROR")
63
+
64
+
65
+ # =============================================================================
66
+ # DATA STRUCTURES
67
+ # =============================================================================
68
+
69
+
70
+ @dataclass
71
+ class UploadResult:
72
+ """Result from file upload."""
73
+
74
+ file_id: str
75
+ reference: str | None
76
+ file_name: str
77
+ file_hash: str
78
+ file_size: int
79
+ content_type: str
80
+ status: str # 'complete' or 'failed'
81
+ error: str | None = None
82
+ processing_duration_ms: int = 0
83
+
84
+
85
+ @dataclass
86
+ class FileInfo:
87
+ """File metadata from Datalab."""
88
+
89
+ file_id: str
90
+ file_name: str | None
91
+ file_size: int | None
92
+ content_type: str | None
93
+ created_at: str | None
94
+ reference: str | None
95
+ status: str | None
96
+
97
+
98
@dataclass
class FileListResult:
    """One page of a file listing."""

    # Plain-dict metadata entries, one per file.
    files: list[dict]
    # Total count reported for the listing.
    total: int
104
+
105
+
106
@dataclass
class DownloadUrlResult:
    """Download URL for a file together with its request metadata."""

    # URL from which the file can be fetched.
    download_url: str
    # Requested URL lifetime in seconds.
    expires_in: int
    # Datalab file identifier, carried as a string.
    file_id: str
113
+
114
+
115
+ # =============================================================================
116
+ # HELPERS
117
+ # =============================================================================
118
+
119
+
120
def _import_sdk_exceptions() -> tuple:
    """Return the SDK exception classes, importing them on first use.

    The import is deferred to call time (like the SDK import in
    ``get_client``) so that loading this module does not require the SDK.

    Returns:
        ``(DatalabAPIError, DatalabFileError, DatalabTimeoutError,
        DatalabValidationError)`` in exactly that order.
    """
    from datalab_sdk.exceptions import DatalabAPIError as api_error
    from datalab_sdk.exceptions import DatalabFileError as file_error
    from datalab_sdk.exceptions import DatalabTimeoutError as timeout_error
    from datalab_sdk.exceptions import DatalabValidationError as validation_error

    sdk_errors = (api_error, file_error, timeout_error, validation_error)
    return sdk_errors
130
+
131
+
132
def _handle_sdk_exception(e: Exception, operation: str, context: str = "") -> None:
    """
    Translate an exception raised by the SDK into a FileManager error.

    This function never returns normally: every path raises, chaining the
    original exception as ``__cause__``.

    Args:
        e: Exception caught around an SDK call.
        operation: Short operation name used in the error message.
        context: File path associated with the operation, if any; used as
            ``file_path`` for file errors ("unknown" when empty).

    Raises:
        FileManagerAPIError: For validation (400), timeout (504), API and
            unexpected (500) errors.
        FileManagerFileError: When the SDK reports a file error.
    """
    DatalabAPIError, DatalabFileError, DatalabTimeoutError, DatalabValidationError = (
        _import_sdk_exceptions()
    )

    if isinstance(e, DatalabValidationError):
        raise FileManagerAPIError(f"Invalid input for {operation}: {e}", 400) from e

    if isinstance(e, DatalabTimeoutError):
        raise FileManagerAPIError(f"{operation} timeout: {e}", 504) from e

    if isinstance(e, DatalabFileError):
        raise FileManagerFileError(f"{operation} file error: {e}", context or "unknown") from e

    if isinstance(e, DatalabAPIError):
        # The attribute may exist but be unset (None). FileManagerAPIError
        # compares the status with 500, so anything that is not an int is
        # treated as a generic 500 instead of crashing with TypeError.
        raw_status = getattr(e, "status_code", None)
        status = raw_status if isinstance(raw_status, int) else 500
        error_msg = str(e).lower()
        if status == 429 or "rate limit" in error_msg:
            raise FileManagerAPIError(f"Rate limit exceeded during {operation}: {e}", 429) from e
        if status in (401, 403):
            raise FileManagerAPIError(
                f"Authentication error during {operation} ({status}): {e}", status
            ) from e
        if status == 404 or "not found" in error_msg:
            raise FileManagerAPIError(f"Not found during {operation}: {e}", 404) from e
        raise FileManagerAPIError(f"API error during {operation} ({status}): {e}", status) from e

    # Unexpected exception type — log and raise as 500
    logger.error(f"Unexpected error during {operation}: {type(e).__name__}: {e}")
    raise FileManagerAPIError(f"SDK {operation} failed: {e}", 500) from e
162
+
163
+
164
def get_client() -> "DatalabClient":
    """
    Build a DatalabClient after checking that an API key is configured.

    FAIL-FAST: a missing or placeholder DATALAB_API_KEY raises immediately.
    The key is not passed to the client here; per the original notes the SDK
    reads DATALAB_API_KEY from the environment itself.

    Raises:
        ValueError: If DATALAB_API_KEY is unset, empty, or still the
            placeholder value.
    """
    from datalab_sdk import DatalabClient

    configured_key = os.environ.get("DATALAB_API_KEY")

    if configured_key == "your_api_key_here":
        raise ValueError(
            "DATALAB_API_KEY is set to placeholder value. Update .env with your actual API key."
        )
    if not configured_key:
        raise ValueError(
            "DATALAB_API_KEY environment variable is required. "
            "Get your key from https://www.datalab.to/settings"
        )

    return DatalabClient()
183
+
184
+
185
def compute_file_hash(file_path: str) -> str:
    """Return ``"sha256:<hex>"`` for the file's bytes.

    The file is read in 64 KiB blocks so large files are never held in
    memory as a whole.
    """
    block_size = 65536
    digest = hashlib.sha256()
    with open(file_path, "rb") as handle:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block in iter(lambda: handle.read(block_size), b""):
            digest.update(block)
    return "sha256:" + digest.hexdigest()
195
+
196
+
197
def get_content_type(file_path: str) -> str:
    """Map a path's extension (case-insensitive) to a MIME type.

    Unknown or missing extensions yield ``application/octet-stream``.
    """
    office_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
    }
    image_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
        ".bmp": "image/bmp",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    text_types = {
        ".txt": "text/plain",
        ".csv": "text/csv",
        ".md": "text/markdown",
    }

    suffix = Path(file_path).suffix.lower()
    for family in (office_types, image_types, text_types):
        if suffix in family:
            return family[suffix]
    return "application/octet-stream"
221
+
222
+
223
def validate_file(file_path: str) -> Path:
    """
    Validate that a path exists, is a regular file, and is readable.
    FAIL-FAST: Raises immediately on any issue.

    Args:
        file_path: Path as supplied by the caller (may be relative).

    Returns:
        The resolved absolute path.

    Raises:
        FileManagerFileError: If the path does not exist, is not a file,
            or cannot be read by the current process.
    """
    path = Path(file_path).resolve()

    if not path.exists():
        raise FileManagerFileError(f"File not found: {file_path}", str(path))

    if not path.is_file():
        raise FileManagerFileError(f"Not a file: {file_path}", str(path))

    # Check readability up front so a permission problem is reported as a
    # categorised file error rather than a raw PermissionError when the
    # file is opened later.
    if not os.access(path, os.R_OK):
        raise FileManagerFileError(f"File not readable: {file_path}", str(path))

    return path
237
+
238
+
239
+ def _serialize_file_metadata(obj: object) -> dict:
240
+ """
241
+ Serialize an UploadedFileMetadata SDK object to a plain dict.
242
+ L-2: SDK returns UploadedFileMetadata dataclass objects, not dicts.
243
+ We explicitly convert to ensure consistent JSON output.
244
+ """
245
+ from dataclasses import fields as dc_fields
246
+
247
+ # If it's already a dict, return as-is
248
+ if isinstance(obj, dict):
249
+ return obj
250
+
251
+ # If it's a dataclass, convert properly with str(file_id) for L-1
252
+ try:
253
+ dc_fields(obj) # Raises TypeError if not a dataclass
254
+ result = asdict(obj)
255
+ # L-1: Ensure file_id is str (SDK returns int)
256
+ if "file_id" in result:
257
+ result["file_id"] = str(result["file_id"])
258
+ return result
259
+ except TypeError:
260
+ pass
261
+
262
+ # Fallback: convert known attributes
263
+ result = {}
264
+ for attr in ("file_id", "original_filename", "content_type", "reference",
265
+ "upload_status", "file_size", "created", "error"):
266
+ val = getattr(obj, attr, None)
267
+ if val is not None:
268
+ result[attr] = str(val) if attr == "file_id" else val
269
+ return result
270
+
271
+
272
+ # =============================================================================
273
+ # API ACTIONS
274
+ # =============================================================================
275
+
276
+
277
def upload_file(file_path: str, timeout: int = 300) -> UploadResult:
    """
    Upload a file to Datalab cloud storage via SDK.

    Per the original notes, the SDK handles the 3-step upload process
    internally with retry logic (tenacity-based exponential backoff for
    429/5xx).

    Args:
        file_path: Path to file to upload
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        UploadResult with file_id and reference

    Raises:
        FileManagerAPIError: On API errors, or when the SDK returns no file_id
        FileManagerFileError: On file access issues
        ValueError: On missing API key
    """
    validated_path = validate_file(file_path)
    client = get_client()
    file_hash = compute_file_hash(str(validated_path))
    file_size = validated_path.stat().st_size
    file_name = validated_path.name
    content_type = get_content_type(str(validated_path))

    logger.info(f"Uploading file via SDK: {validated_path} ({file_size} bytes)")

    # Monotonic clock: the measured duration cannot be skewed by wall-clock
    # adjustments during the upload.
    start_time = time.monotonic()

    try:
        result = client.upload_files(str(validated_path))
    except Exception as e:
        _handle_sdk_exception(e, "upload", str(validated_path))

    duration_ms = int((time.monotonic() - start_time) * 1000)

    # SDK returns UploadedFileMetadata with file_id (int), reference, etc.
    # Check the raw value BEFORE stringifying: str(None) is the truthy
    # string "None", which would slip past an emptiness check.
    raw_file_id = result.file_id
    if raw_file_id is None or raw_file_id == "":
        raise FileManagerAPIError("SDK returned empty file_id", 500)

    # L-1: SDK's UploadedFileMetadata.file_id is int — convert to str for JSON protocol
    file_id = str(raw_file_id)
    reference = result.reference

    logger.info(f"Upload complete via SDK: file_id={file_id}, reference={reference}")

    return UploadResult(
        file_id=file_id,
        reference=reference,
        file_name=file_name,
        file_hash=file_hash,
        file_size=file_size,
        content_type=content_type,
        status="complete",
        processing_duration_ms=duration_ms,
    )
335
+
336
+
337
def list_files(limit: int = 50, offset: int = 0, timeout: int = 60) -> FileListResult:
    """
    Fetch one page of the Datalab file listing via SDK.

    Args:
        limit: Max files to return
        offset: Pagination offset
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        FileListResult holding the serialized entries and the total count
        (the number of returned entries when the response has no 'total').
    """
    sdk = get_client()

    try:
        response = sdk.list_files(limit=limit, offset=offset)
    except Exception as exc:
        _handle_sdk_exception(exc, "list_files")

    # L-2: entries under 'files' are SDK metadata objects; turn each one
    # into a plain dict so the result can be dumped as JSON.
    entries = []
    for item in response.get("files", []):
        entries.append(_serialize_file_metadata(item))

    return FileListResult(files=entries, total=response.get("total", len(entries)))
363
+
364
+
365
def get_file(file_id: str, timeout: int = 60) -> FileInfo:
    """
    Look up the metadata of one stored file via SDK.

    Args:
        file_id: Datalab file ID
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        FileInfo populated from the SDK metadata object.
    """
    sdk = get_client()

    try:
        metadata = sdk.get_file_metadata(file_id)
    except Exception as exc:
        _handle_sdk_exception(exc, "get_file_metadata")

    # L-1: the SDK id is converted to str for the JSON protocol.
    identifier = str(metadata.file_id)
    name = metadata.original_filename
    size = metadata.file_size
    mime = metadata.content_type
    # A falsy 'created' is reported as None; anything else is stringified.
    created_at = str(metadata.created) if metadata.created else None

    return FileInfo(
        file_id=identifier,
        file_name=name,
        file_size=size,
        content_type=mime,
        created_at=created_at,
        reference=metadata.reference,
        status=metadata.upload_status,
    )
393
+
394
+
395
def get_download_url(file_id: str, expires_in: int = 3600, timeout: int = 60) -> DownloadUrlResult:
    """
    Request a download URL for a stored file via SDK.

    Args:
        file_id: Datalab file ID
        expires_in: URL expiry time in seconds (default: 3600, min: 60, max: 86400)
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        DownloadUrlResult with download_url, expires_in, and file_id

    Raises:
        FileManagerAPIError: On invalid expires_in, API errors, or missing download_url
    """
    # L-3: reject out-of-range lifetimes before contacting the API.
    too_short = expires_in < 60
    too_long = expires_in > 86400
    if too_short or too_long:
        raise FileManagerAPIError(
            f"expires_in must be between 60 and 86400 seconds, got {expires_in}", 400
        )

    sdk = get_client()

    try:
        response = sdk.get_file_download_url(file_id, expires_in=expires_in)
    except Exception as exc:
        _handle_sdk_exception(exc, "get_download_url")

    url = response.get("download_url")
    if url:
        # Prefer the id echoed by the SDK; fall back to the requested one.
        return DownloadUrlResult(
            download_url=url,
            expires_in=expires_in,
            file_id=str(response.get("file_id", file_id)),
        )

    raise FileManagerAPIError(
        f"No download_url in SDK response. Keys: {list(response.keys())}",
        500,
    )
435
+
436
+
437
def delete_file(file_id: str, timeout: int = 60) -> bool:
    """
    Remove a file from Datalab cloud storage via SDK.

    Args:
        file_id: Datalab file ID
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        True if deleted

    Raises:
        FileManagerAPIError: On API errors, or when the SDK response carries
            a falsy 'success' value.
    """
    sdk = get_client()

    try:
        outcome = sdk.delete_file(file_id)
    except Exception as exc:
        _handle_sdk_exception(exc, "delete_file")

    # A response without a 'success' key counts as success.
    if outcome.get("success", True):
        return True

    raise FileManagerAPIError(
        f"SDK delete returned failure: {outcome.get('message', 'unknown')}",
        500,
    )
462
+
463
+
464
+ # =============================================================================
465
+ # CLI INTERFACE
466
+ # =============================================================================
467
+
468
+
469
def main() -> None:
    """CLI entry point.

    Parses the command line, runs exactly one action and prints a single
    JSON document to stdout. On any failure a JSON error object with
    ``error``, ``category`` and ``details`` is printed instead and the
    process exits with status 1.
    """
    # Load .env file if present
    try:
        from dotenv import load_dotenv

        env_path = Path(__file__).parent.parent / ".env"
        if env_path.exists():
            load_dotenv(env_path)
            logger.debug(f"Loaded environment from {env_path}")
    except ImportError:
        pass  # python-dotenv not installed, skip

    parser = argparse.ArgumentParser(
        description="Datalab File Manager Worker - Upload, list, get, download, delete files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python file_manager_worker.py --action upload --file document.pdf
  python file_manager_worker.py --action list --limit 10
  python file_manager_worker.py --action get --file-id abc123
  python file_manager_worker.py --action download-url --file-id abc123 --expires-in 7200
  python file_manager_worker.py --action delete --file-id abc123
        """,
    )
    parser.add_argument(
        "--action",
        required=True,
        choices=["upload", "list", "get", "download-url", "delete"],
        help="Action to perform",
    )
    parser.add_argument("--file", "-f", type=str, help="File path (for upload)")
    parser.add_argument("--file-id", type=str, help="Datalab file ID (for get/download-url/delete)")
    parser.add_argument("--limit", type=int, default=50, help="Limit for list (default: 50)")
    parser.add_argument("--offset", type=int, default=0, help="Offset for list (default: 0)")
    parser.add_argument(
        "--expires-in",
        type=int,
        default=3600,
        help="Download URL expiry in seconds (default: 3600, min: 60, max: 86400)",
    )
    parser.add_argument("--timeout", type=int, default=300, help="Timeout seconds (default: 300)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")

    args = parser.parse_args()

    # Suppress logging for clean JSON output, unless --verbose asks for it.
    root_level = logging.DEBUG if args.verbose else logging.CRITICAL
    logging.getLogger().setLevel(root_level)

    try:
        action = args.action
        payload = None

        if action == "upload":
            if not args.file:
                raise ValueError("--file is required for upload action")
            payload = asdict(upload_file(args.file, timeout=args.timeout))

        elif action == "list":
            listing = list_files(limit=args.limit, offset=args.offset, timeout=args.timeout)
            payload = asdict(listing)

        elif action == "get":
            if not args.file_id:
                raise ValueError("--file-id is required for get action")
            payload = asdict(get_file(args.file_id, timeout=args.timeout))

        elif action == "download-url":
            if not args.file_id:
                raise ValueError("--file-id is required for download-url action")
            url_info = get_download_url(
                args.file_id, expires_in=args.expires_in, timeout=args.timeout
            )
            payload = asdict(url_info)

        elif action == "delete":
            if not args.file_id:
                raise ValueError("--file-id is required for delete action")
            delete_file(args.file_id, timeout=args.timeout)
            payload = {"deleted": True, "file_id": args.file_id}

        # Serialisation stays inside the try so an encoding failure is also
        # reported as a JSON error object.
        if payload is not None:
            print(json.dumps(payload))

    except Exception as exc:
        logger.exception(f"Fatal error: {exc}")
        # Copy the optional diagnostic attributes set by FileManager errors.
        details = {
            key: getattr(exc, key)
            for key in ("status_code", "file_path")
            if hasattr(exc, key)
        }
        failure = {
            "error": str(exc),
            "category": getattr(exc, "category", "FILE_MANAGER_API_ERROR"),
            "details": details,
        }
        print(json.dumps(failure))
        sys.exit(1)
561
+
562
+
563
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()