ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,564 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Datalab File Manager Worker for OCR Provenance MCP System
|
|
4
|
+
|
|
5
|
+
Manages file uploads, listing, retrieval, and deletion via Datalab API.
|
|
6
|
+
FAIL-FAST: No fallbacks, no mocks. Errors propagate immediately.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import asdict, dataclass
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
# Configure logging FIRST - all logging goes to stderr
|
|
20
|
+
# Send every log record to stderr so that stdout carries only the JSON
# document printed by the CLI entry point.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    stream=sys.stderr,
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# =============================================================================
|
|
29
|
+
# CONSTANTS
|
|
30
|
+
# =============================================================================
|
|
31
|
+
|
|
32
|
+
# SDK handles base URL via DATALAB_HOST env var (default: https://www.datalab.to)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# =============================================================================
|
|
36
|
+
# ERROR CLASSES (same pattern as form_fill_worker.py)
|
|
37
|
+
# =============================================================================
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FileManagerError(Exception):
    """
    Root of the file manager exception hierarchy.

    Besides the usual message, every instance carries a ``category`` string
    that error handlers can use to classify the failure.
    """

    def __init__(self, message: str, category: str):
        # Store the classification first; the message goes to Exception.
        self.category = category
        super().__init__(message)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FileManagerAPIError(FileManagerError):
    """
    Failure reported by the API, tagged with its HTTP-style status code.

    Status codes of 500 and above are categorised as
    ``FILE_MANAGER_SERVER_ERROR``; every other code is categorised as
    ``FILE_MANAGER_API_ERROR``.
    """

    def __init__(self, message: str, status_code: int):
        if status_code >= 500:
            error_category = "FILE_MANAGER_SERVER_ERROR"
        else:
            error_category = "FILE_MANAGER_API_ERROR"
        super().__init__(message, error_category)
        self.status_code = status_code
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class FileManagerFileError(FileManagerError):
    """
    Problem accessing a file.

    Always categorised as ``FILE_MANAGER_FILE_ERROR``; the offending path is
    kept on ``file_path``.
    """

    def __init__(self, message: str, file_path: str):
        super().__init__(message, "FILE_MANAGER_FILE_ERROR")
        self.file_path = file_path
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# =============================================================================
|
|
66
|
+
# DATA STRUCTURES
|
|
67
|
+
# =============================================================================
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
|
|
71
|
+
class UploadResult:
|
|
72
|
+
"""Result from file upload."""
|
|
73
|
+
|
|
74
|
+
file_id: str
|
|
75
|
+
reference: str | None
|
|
76
|
+
file_name: str
|
|
77
|
+
file_hash: str
|
|
78
|
+
file_size: int
|
|
79
|
+
content_type: str
|
|
80
|
+
status: str # 'complete' or 'failed'
|
|
81
|
+
error: str | None = None
|
|
82
|
+
processing_duration_ms: int = 0
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
|
|
86
|
+
class FileInfo:
|
|
87
|
+
"""File metadata from Datalab."""
|
|
88
|
+
|
|
89
|
+
file_id: str
|
|
90
|
+
file_name: str | None
|
|
91
|
+
file_size: int | None
|
|
92
|
+
content_type: str | None
|
|
93
|
+
created_at: str | None
|
|
94
|
+
reference: str | None
|
|
95
|
+
status: str | None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class FileListResult:
    """One page of a file listing together with the reported total count."""

    files: list[dict]
    total: int
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class DownloadUrlResult:
    """A download URL plus the expiry (seconds) and file id it belongs to."""

    download_url: str
    expires_in: int
    file_id: str
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# =============================================================================
|
|
116
|
+
# HELPERS
|
|
117
|
+
# =============================================================================
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _import_sdk_exceptions() -> tuple:
    """
    Load the SDK exception classes on demand.

    The import happens inside the function rather than at module load time.
    Returns the classes in the fixed order
    (DatalabAPIError, DatalabFileError, DatalabTimeoutError, DatalabValidationError).
    """
    from datalab_sdk.exceptions import (
        DatalabAPIError,
        DatalabFileError,
        DatalabTimeoutError,
        DatalabValidationError,
    )

    sdk_exception_types = (
        DatalabAPIError,
        DatalabFileError,
        DatalabTimeoutError,
        DatalabValidationError,
    )
    return sdk_exception_types
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _handle_sdk_exception(e: Exception, operation: str, context: str = "") -> None:
    """
    Translate an SDK exception into the matching FileManager error.

    This function never returns normally: every path raises, with the
    original exception chained as the cause.

    Args:
        e: Exception caught around an SDK call
        operation: Short operation name used in the error message
        context: File path attached to file errors ("unknown" when empty)

    Raises:
        FileManagerAPIError: Validation (400), timeout (504), rate limit (429),
            authentication (401/403), not found (404), other API errors
            (their own status) and unexpected exception types (500)
        FileManagerFileError: When the SDK reports a file error
    """
    DatalabAPIError, DatalabFileError, DatalabTimeoutError, DatalabValidationError = _import_sdk_exceptions()

    if isinstance(e, DatalabValidationError):
        raise FileManagerAPIError(f"Invalid input for {operation}: {e}", 400) from e

    if isinstance(e, DatalabTimeoutError):
        raise FileManagerAPIError(f"{operation} timeout: {e}", 504) from e

    if isinstance(e, DatalabFileError):
        raise FileManagerFileError(f"{operation} file error: {e}", context or "unknown") from e

    if isinstance(e, DatalabAPIError):
        # A getattr default only covers a *missing* attribute. If the
        # attribute exists but holds None (or another non-int), passing it on
        # would make FileManagerAPIError's `status_code >= 500` comparison
        # raise TypeError and hide the real API error.
        # NOTE(review): presumably the SDK can leave status_code as None — confirm.
        raw_status = getattr(e, "status_code", None)
        status = raw_status if isinstance(raw_status, int) else 500
        error_msg = str(e).lower()
        if status == 429 or "rate limit" in error_msg:
            raise FileManagerAPIError(f"Rate limit exceeded during {operation}: {e}", 429) from e
        if status in (401, 403):
            raise FileManagerAPIError(f"Authentication error during {operation} ({status}): {e}", status) from e
        if status == 404 or "not found" in error_msg:
            raise FileManagerAPIError(f"Not found during {operation}: {e}", 404) from e
        raise FileManagerAPIError(f"API error during {operation} ({status}): {e}", status) from e

    # Unexpected exception type — log and raise as 500
    logger.error(f"Unexpected error during {operation}: {type(e).__name__}: {e}")
    raise FileManagerAPIError(f"SDK {operation} failed: {e}", 500) from e
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def get_client() -> "DatalabClient":
    """
    Build a DatalabClient after checking that an API key is configured.

    FAIL-FAST: a missing, empty or placeholder DATALAB_API_KEY raises
    ValueError before any client is created. The key itself is not passed to
    the constructor; the SDK reads it from the environment.
    """
    from datalab_sdk import DatalabClient

    configured_key = os.environ.get("DATALAB_API_KEY")

    # The two checks are mutually exclusive, so their order does not matter.
    if configured_key == "your_api_key_here":
        raise ValueError(
            "DATALAB_API_KEY is set to placeholder value. Update .env with your actual API key."
        )
    if not configured_key:
        raise ValueError(
            "DATALAB_API_KEY environment variable is required. "
            "Get your key from https://www.datalab.to/settings"
        )

    return DatalabClient()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def compute_file_hash(file_path: str) -> str:
    """Return the file's SHA-256 digest formatted as ``sha256:<hex>``.

    The file is read in 64KB pieces so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(65536), b""):
            digest.update(block)
    return "sha256:" + digest.hexdigest()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_content_type(file_path: str) -> str:
    """Map a path's (case-insensitive) extension to a MIME type.

    Extensions that are not in the table yield ``application/octet-stream``.
    """
    known_types = {
        # Documents
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        # Images
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
        ".bmp": "image/bmp",
        ".gif": "image/gif",
        ".webp": "image/webp",
        # Text
        ".txt": "text/plain",
        ".csv": "text/csv",
        ".md": "text/markdown",
    }
    suffix = Path(file_path).suffix.lower()
    try:
        return known_types[suffix]
    except KeyError:
        return "application/octet-stream"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def validate_file(file_path: str) -> Path:
    """
    Resolve a path and confirm it points at an existing regular file.
    FAIL-FAST: raises FileManagerFileError as soon as a check fails.

    Returns:
        The resolved absolute Path
    """
    resolved = Path(file_path).resolve()
    location = str(resolved)

    # Messages quote the caller's original spelling; the error object carries
    # the resolved location.
    if not resolved.exists():
        raise FileManagerFileError(f"File not found: {file_path}", location)
    if not resolved.is_file():
        raise FileManagerFileError(f"Not a file: {file_path}", location)

    return resolved
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _serialize_file_metadata(obj: object) -> dict:
|
|
240
|
+
"""
|
|
241
|
+
Serialize an UploadedFileMetadata SDK object to a plain dict.
|
|
242
|
+
L-2: SDK returns UploadedFileMetadata dataclass objects, not dicts.
|
|
243
|
+
We explicitly convert to ensure consistent JSON output.
|
|
244
|
+
"""
|
|
245
|
+
from dataclasses import fields as dc_fields
|
|
246
|
+
|
|
247
|
+
# If it's already a dict, return as-is
|
|
248
|
+
if isinstance(obj, dict):
|
|
249
|
+
return obj
|
|
250
|
+
|
|
251
|
+
# If it's a dataclass, convert properly with str(file_id) for L-1
|
|
252
|
+
try:
|
|
253
|
+
dc_fields(obj) # Raises TypeError if not a dataclass
|
|
254
|
+
result = asdict(obj)
|
|
255
|
+
# L-1: Ensure file_id is str (SDK returns int)
|
|
256
|
+
if "file_id" in result:
|
|
257
|
+
result["file_id"] = str(result["file_id"])
|
|
258
|
+
return result
|
|
259
|
+
except TypeError:
|
|
260
|
+
pass
|
|
261
|
+
|
|
262
|
+
# Fallback: convert known attributes
|
|
263
|
+
result = {}
|
|
264
|
+
for attr in ("file_id", "original_filename", "content_type", "reference",
|
|
265
|
+
"upload_status", "file_size", "created", "error"):
|
|
266
|
+
val = getattr(obj, attr, None)
|
|
267
|
+
if val is not None:
|
|
268
|
+
result[attr] = str(val) if attr == "file_id" else val
|
|
269
|
+
return result
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# =============================================================================
|
|
273
|
+
# API ACTIONS
|
|
274
|
+
# =============================================================================
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def upload_file(file_path: str, timeout: int = 300) -> UploadResult:
    """
    Upload a file to Datalab cloud storage via SDK.

    The SDK handles the 3-step upload process internally with retry logic
    (tenacity-based exponential backoff for 429/5xx).

    Args:
        file_path: Path to file to upload
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        UploadResult with file_id and reference

    Raises:
        FileManagerAPIError: On API errors, or when the SDK returns no file_id
        FileManagerFileError: On file access issues
        ValueError: On missing API key
    """
    validated_path = validate_file(file_path)
    client = get_client()
    file_hash = compute_file_hash(str(validated_path))
    file_size = validated_path.stat().st_size
    file_name = validated_path.name
    content_type = get_content_type(str(validated_path))

    logger.info(f"Uploading file via SDK: {validated_path} ({file_size} bytes)")

    # Monotonic clock: the measured duration cannot be skewed by wall-clock
    # adjustments made while the upload is running.
    start_time = time.monotonic()

    try:
        result = client.upload_files(str(validated_path))
    except Exception as e:
        # Always raises a FileManager error; execution does not continue.
        _handle_sdk_exception(e, "upload", str(validated_path))

    # Validate the raw value BEFORE converting: str(None) is the truthy
    # string "None", so checking after the conversion can never fail.
    raw_file_id = result.file_id
    if raw_file_id is None or raw_file_id == "":
        raise FileManagerAPIError("SDK returned empty file_id", 500)

    # SDK returns UploadedFileMetadata with file_id (int), reference, etc.
    # L-1: SDK's UploadedFileMetadata.file_id is int — convert to str for JSON protocol
    file_id = str(raw_file_id)
    reference = result.reference

    logger.info(f"Upload complete via SDK: file_id={file_id}, reference={reference}")

    duration_ms = int((time.monotonic() - start_time) * 1000)

    return UploadResult(
        file_id=file_id,
        reference=reference,
        file_name=file_name,
        file_hash=file_hash,
        file_size=file_size,
        content_type=content_type,
        status="complete",
        processing_duration_ms=duration_ms,
    )
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def list_files(limit: int = 50, offset: int = 0, timeout: int = 60) -> FileListResult:
    """
    Fetch one page of the files stored in Datalab via the SDK.

    Args:
        limit: Maximum number of files in the page
        offset: Number of files to skip (pagination)
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        FileListResult holding the serialized files and the total count
    """
    sdk = get_client()

    try:
        listing = sdk.list_files(limit=limit, offset=offset)
    except Exception as exc:
        # Always raises a FileManager error; execution does not continue.
        _handle_sdk_exception(exc, "list_files")

    # L-2: entries may be SDK metadata objects, so each one is turned into a
    # plain dict before being returned.
    serialized = list(map(_serialize_file_metadata, listing.get("files", [])))

    # Fall back to the page size when the response carries no 'total'.
    return FileListResult(files=serialized, total=listing.get("total", len(serialized)))
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def get_file(file_id: str, timeout: int = 60) -> FileInfo:
    """
    Look up the metadata of a single stored file via the SDK.

    Args:
        file_id: Datalab file ID
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        FileInfo populated from the SDK metadata object
    """
    sdk = get_client()

    try:
        meta = sdk.get_file_metadata(file_id)
    except Exception as exc:
        # Always raises a FileManager error; execution does not continue.
        _handle_sdk_exception(exc, "get_file_metadata")

    # Attributes are read in the same order as the FileInfo fields.
    info = {
        "file_id": str(meta.file_id),  # L-1: Ensure file_id is str
        "file_name": meta.original_filename,
        "file_size": meta.file_size,
        "content_type": meta.content_type,
    }
    if meta.created:
        info["created_at"] = str(meta.created)
    else:
        info["created_at"] = None
    info["reference"] = meta.reference
    info["status"] = meta.upload_status

    return FileInfo(**info)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def get_download_url(file_id: str, expires_in: int = 3600, timeout: int = 60) -> DownloadUrlResult:
    """
    Request a time-limited download URL for a stored file via the SDK.

    Args:
        file_id: Datalab file ID
        expires_in: URL lifetime in seconds (default: 3600, allowed: 60..86400)
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        DownloadUrlResult with download_url, expires_in, and file_id

    Raises:
        FileManagerAPIError: On out-of-range expires_in, API errors, or a
            response without a download_url
    """
    # L-3: reject lifetimes outside the allowed window before calling the SDK
    if expires_in > 86400 or expires_in < 60:
        raise FileManagerAPIError(
            f"expires_in must be between 60 and 86400 seconds, got {expires_in}", 400
        )

    sdk = get_client()

    try:
        response = sdk.get_file_download_url(file_id, expires_in=expires_in)
    except Exception as exc:
        # Always raises a FileManager error; execution does not continue.
        _handle_sdk_exception(exc, "get_download_url")

    url = response.get("download_url")
    if url:
        # Prefer the id echoed by the response; fall back to the requested one.
        echoed_id = str(response.get("file_id", file_id))
        return DownloadUrlResult(download_url=url, expires_in=expires_in, file_id=echoed_id)

    raise FileManagerAPIError(
        f"No download_url in SDK response. Keys: {list(response.keys())}",
        500,
    )
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def delete_file(file_id: str, timeout: int = 60) -> bool:
    """
    Remove a stored file from Datalab via the SDK.

    Args:
        file_id: Datalab file ID
        timeout: Request timeout in seconds (unused - SDK manages timeouts)

    Returns:
        True once the deletion went through

    Raises:
        FileManagerAPIError: On API errors, or when the response reports
            success as falsy
    """
    sdk = get_client()

    try:
        outcome = sdk.delete_file(file_id)
    except Exception as exc:
        # Always raises a FileManager error; execution does not continue.
        _handle_sdk_exception(exc, "delete_file")

    # A response without a 'success' key is treated as success.
    if outcome.get("success", True):
        return True

    raise FileManagerAPIError(
        f"SDK delete returned failure: {outcome.get('message', 'unknown')}",
        500,
    )
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
# =============================================================================
|
|
465
|
+
# CLI INTERFACE
|
|
466
|
+
# =============================================================================
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, runs the requested action and prints exactly one
    JSON document to stdout. On any failure the JSON document describes the
    error (message, category, details) and the process exits with status 1.
    """
    # Optional .env support: silently skipped when python-dotenv is missing.
    try:
        from dotenv import load_dotenv

        dotenv_file = Path(__file__).parent.parent / ".env"
        if dotenv_file.exists():
            load_dotenv(dotenv_file)
            logger.debug(f"Loaded environment from {dotenv_file}")
    except ImportError:
        pass

    # NOTE(review): the whitespace inside this epilog could not be recovered
    # from the mangled source — confirm against the original file.
    usage_examples = """
Examples:
  python file_manager_worker.py --action upload --file document.pdf
  python file_manager_worker.py --action list --limit 10
  python file_manager_worker.py --action get --file-id abc123
  python file_manager_worker.py --action download-url --file-id abc123 --expires-in 7200
  python file_manager_worker.py --action delete --file-id abc123
"""
    parser = argparse.ArgumentParser(
        description="Datalab File Manager Worker - Upload, list, get, download, delete files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    parser.add_argument(
        "--action",
        required=True,
        choices=["upload", "list", "get", "download-url", "delete"],
        help="Action to perform",
    )
    parser.add_argument("--file", "-f", type=str, help="File path (for upload)")
    parser.add_argument("--file-id", type=str, help="Datalab file ID (for get/download-url/delete)")
    parser.add_argument("--limit", type=int, default=50, help="Limit for list (default: 50)")
    parser.add_argument("--offset", type=int, default=0, help="Offset for list (default: 0)")
    parser.add_argument("--expires-in", type=int, default=3600, help="Download URL expiry in seconds (default: 3600, min: 60, max: 86400)")
    parser.add_argument("--timeout", type=int, default=300, help="Timeout seconds (default: 300)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging")

    args = parser.parse_args()

    # Keep output limited to the JSON document unless --verbose is given.
    root_level = logging.DEBUG if args.verbose else logging.CRITICAL
    logging.getLogger().setLevel(root_level)

    try:
        action = args.action

        # get / download-url / delete all operate on an existing file id.
        if action in ("get", "download-url", "delete") and not args.file_id:
            raise ValueError(f"--file-id is required for {action} action")

        if action == "upload":
            if not args.file:
                raise ValueError("--file is required for upload action")
            payload = asdict(upload_file(args.file, timeout=args.timeout))
        elif action == "list":
            payload = asdict(list_files(limit=args.limit, offset=args.offset, timeout=args.timeout))
        elif action == "get":
            payload = asdict(get_file(args.file_id, timeout=args.timeout))
        elif action == "download-url":
            payload = asdict(
                get_download_url(args.file_id, expires_in=args.expires_in, timeout=args.timeout)
            )
        else:
            # Only "delete" remains: argparse restricts --action to the five choices.
            delete_file(args.file_id, timeout=args.timeout)
            payload = {"deleted": True, "file_id": args.file_id}

        # Serialisation stays inside the try so its failures are reported as JSON too.
        print(json.dumps(payload))

    except Exception as e:
        logger.exception(f"Fatal error: {e}")
        # Copy over whichever extra attributes the FileManager errors carry.
        details = {
            name: getattr(e, name)
            for name in ("status_code", "file_path")
            if hasattr(e, name)
        }
        failure = {
            "error": str(e),
            "category": getattr(e, "category", "FILE_MANAGER_API_ERROR"),
            "details": details,
        }
        print(json.dumps(failure))
        sys.exit(1)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|