ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,712 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Datalab OCR Worker for OCR Provenance MCP System
|
|
4
|
+
|
|
5
|
+
Extracts text from documents using Datalab API.
|
|
6
|
+
FAIL-FAST: No fallbacks, no mocks. Errors propagate immediately.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
import uuid
|
|
18
|
+
from dataclasses import asdict, dataclass
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Literal
|
|
21
|
+
|
|
22
|
+
# Logging is configured before anything else in the module runs.
# All records go to stderr, leaving stdout free for the CLI's result output.
logging.basicConfig(
    stream=sys.stderr,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# =============================================================================
|
|
32
|
+
# ERROR CLASSES (CS-ERR-001 compliant - inline, no separate module)
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class OCRError(Exception):
|
|
37
|
+
"""Base OCR error with category for error handling."""
|
|
38
|
+
|
|
39
|
+
def __init__(self, message: str, category: str, request_id: str | None = None):
|
|
40
|
+
super().__init__(message)
|
|
41
|
+
self.category = category
|
|
42
|
+
self.request_id = request_id
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class OCRAPIError(OCRError):
    """Error response from the API, categorised by HTTP status code.

    A status of 500 or above is categorised as ``OCR_SERVER_ERROR``;
    every other status is categorised as ``OCR_API_ERROR``.

    Attributes:
        status_code: The HTTP status code supplied by the caller.
    """

    def __init__(self, message: str, status_code: int, request_id: str | None = None):
        if status_code >= 500:
            category = "OCR_SERVER_ERROR"
        else:
            category = "OCR_API_ERROR"
        super().__init__(message, category, request_id)
        self.status_code = status_code
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class OCRRateLimitError(OCRError):
    """The API rejected the request because the rate limit was hit (429).

    Attributes:
        retry_after: Suggested wait before retrying; defaults to 60
            (presumably seconds - confirm against the caller).
    """

    def __init__(self, message: str = "Rate limit exceeded", retry_after: int = 60):
        # No request id is recorded for rate-limit failures.
        super().__init__(message, "OCR_RATE_LIMIT")
        self.retry_after = retry_after
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class OCRTimeoutError(OCRError):
    """OCR processing did not finish in time (category ``OCR_TIMEOUT``)."""

    def __init__(self, message: str, request_id: str | None = None):
        super().__init__(message, category="OCR_TIMEOUT", request_id=request_id)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class OCRFileError(OCRError):
    """A problem accessing the input file (category ``OCR_FILE_ERROR``).

    Attributes:
        file_path: The path (or URL) the error refers to.
    """

    def __init__(self, message: str, file_path: str):
        super().__init__(message, "OCR_FILE_ERROR")
        self.file_path = file_path
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class OCRAuthenticationError(OCRError):
    """Authentication or subscription failure (HTTP 401/403).

    The stored message is expanded with an actionable hint:

    * subscription problems (status 403, or a message mentioning
      "subscription" or "expired") point at the Datalab settings page;
    * a 401 asks the user to verify ``DATALAB_API_KEY``;
    * anything else is reported as a plain access-denied error.

    Attributes:
        status_code: The HTTP status code supplied by the caller.
    """

    def __init__(self, message: str, status_code: int):
        lowered = message.lower()
        subscription_problem = (
            status_code == 403 or "subscription" in lowered or "expired" in lowered
        )

        if subscription_problem:
            detailed_msg = (
                f"Datalab API subscription inactive (HTTP {status_code}). {message} "
                "Action: Renew subscription at https://www.datalab.to/settings"
            )
        elif status_code == 401:
            detailed_msg = (
                f"Datalab API authentication failed. {message} "
                "Action: Verify DATALAB_API_KEY is correct"
            )
        else:
            detailed_msg = f"Datalab API access denied (HTTP {status_code}). {message}"

        super().__init__(detailed_msg, "OCR_AUTHENTICATION_ERROR")
        self.status_code = status_code
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# =============================================================================
|
|
99
|
+
# DATA STRUCTURES (match src/models/document.ts exactly)
|
|
100
|
+
# =============================================================================
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class PageOffset:
    """
    Location of one page inside the full extracted text.

    MUST match the PageOffset interface in src/models/document.ts.
    The TypeScript side uses camelCase names (charStart); this Python
    side uses snake_case (char_start).
    """

    page: int  # Page number, counted from 1
    char_start: int  # Offset in the full text where the page begins
    char_end: int  # Offset in the full text where the page ends
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass
class OCRResult:
    """
    Outcome of one OCR run.

    The required fields MUST match the OCRResult interface in
    src/models/document.ts exactly; the trailing optional fields are
    extras that the TypeScript interface does not declare.
    """

    # --- Required fields (mirror the TypeScript interface) ---
    id: str  # UUID, generated with uuid.uuid4()
    provenance_id: str  # UUID supplied by the caller
    document_id: str  # UUID supplied by the caller
    extracted_text: str  # Markdown text returned by Datalab
    text_length: int  # len(extracted_text)
    datalab_request_id: str  # Unique ID for this request
    datalab_mode: Literal["fast", "balanced", "accurate"]
    parse_quality_score: float | None
    page_count: int
    cost_cents: float | None
    content_hash: str  # "sha256:..." digest of extracted_text
    processing_started_at: str  # ISO 8601 timestamp
    processing_completed_at: str  # ISO 8601 timestamp
    processing_duration_ms: int

    # --- Provenance extras (absent from the TS interface but needed) ---
    page_offsets: list[PageOffset]  # Per-page character offsets
    error: str | None = None

    # Images extracted by Datalab: filename -> base64 data
    images: dict[str, str] | None = None

    # JSON block hierarchy from Datalab (when output_format includes 'json')
    json_blocks: dict | None = None

    # Datalab metadata (page_stats, block_counts, etc.)
    metadata: dict | None = None

    # Structured extraction result (when a page_schema was provided)
    extraction_json: dict | list | None = None

    # Complete cost breakdown dict as returned by Datalab
    cost_breakdown_full: dict | None = None

    # Document-level metadata reported by Datalab
    doc_title: str | None = None
    doc_author: str | None = None
    doc_subject: str | None = None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# =============================================================================
|
|
165
|
+
# SUPPORTED FILE TYPES (match src/models/document.ts)
|
|
166
|
+
# =============================================================================
|
|
167
|
+
|
|
168
|
+
# Lower-case file suffixes (leading dot included) accepted for processing.
SUPPORTED_EXTENSIONS = frozenset(
    (
        # PDF
        ".pdf",
        # Raster images
        ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".gif", ".webp",
        # Office documents
        ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls",
        # Plain-text formats
        ".txt", ".csv", ".md",
    )
)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# =============================================================================
|
|
193
|
+
# MAIN IMPLEMENTATION
|
|
194
|
+
# =============================================================================
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_api_key() -> str:
    """
    Read the Datalab API key from the DATALAB_API_KEY environment variable.

    FAIL-FAST: no fallback value is ever substituted.

    Returns:
        The key exactly as stored in the environment.

    Raises:
        ValueError: If the variable is unset or empty, or still holds the
            "your_api_key_here" placeholder.
    """
    key = os.environ.get("DATALAB_API_KEY")

    if key == "your_api_key_here":
        raise ValueError(
            "DATALAB_API_KEY is set to placeholder value. Update .env with your actual API key."
        )
    if key:
        return key

    # Unset and empty are treated the same way.
    raise ValueError(
        "DATALAB_API_KEY environment variable is required. "
        "Get your key from https://www.datalab.to/settings"
    )
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def validate_file(file_path: str) -> Path:
    """
    Check that *file_path* names an existing regular file of a supported type.

    FAIL-FAST: the first failed check raises immediately.

    Args:
        file_path: Path to the candidate input file.

    Returns:
        The resolved path.

    Raises:
        OCRFileError: If the path does not exist, is not a regular file, or
            its suffix (compared case-insensitively) is not in
            SUPPORTED_EXTENSIONS.
    """
    resolved = Path(file_path).resolve()
    location = str(resolved)

    if not resolved.exists():
        raise OCRFileError(f"File not found: {file_path}", location)
    if not resolved.is_file():
        raise OCRFileError(f"Not a file: {file_path}", location)

    extension = resolved.suffix
    if extension.lower() in SUPPORTED_EXTENSIONS:
        return resolved

    supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
    raise OCRFileError(
        f"Unsupported file type: {extension}. Supported: {supported}",
        location,
    )
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def compute_content_hash(content: str) -> str:
    """
    Hash *content* with SHA-256, in the format used by src/utils/hash.ts.

    The text is encoded as UTF-8 before hashing.

    Returns:
        'sha256:' followed by the 64-character lowercase hex digest.
    """
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return "sha256:" + digest.hexdigest()
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def parse_page_offsets(markdown: str) -> list[PageOffset]:
    """
    Parse page delimiters from Datalab paginated output.

    Datalab with paginate=True adds markers like:
        ---
        <!-- Page 2 -->

    Everything before the first marker is reported as page 1. Each marker
    starts a new page whose number is taken from the marker itself; the page
    runs from the end of its marker to the start of the next marker (or the
    end of the text). Marker text is never part of any page's span.

    Offsets come straight from the regex match positions, so they stay exact
    even if a marker's digits do not round-trip through int() (for example a
    zero-padded "Page 02").

    Args:
        markdown: Full markdown text, possibly containing page markers.

    Returns:
        One PageOffset per page, in document order. Text without any marker
        yields a single entry for page 1 covering the whole string.
    """
    # newline + "---" + newline + "<!-- Page N -->" + newline
    page_pattern = r"\n---\n<!-- Page (\d+) -->\n"
    markers = list(re.finditer(page_pattern, markdown))

    if not markers:
        # No page markers = single page document
        return [PageOffset(page=1, char_start=0, char_end=len(markdown))]

    # Content ahead of the first marker belongs to page 1.
    offsets = [PageOffset(page=1, char_start=0, char_end=markers[0].start())]

    for index, marker in enumerate(markers):
        if index + 1 < len(markers):
            page_end = markers[index + 1].start()
        else:
            page_end = len(markdown)
        offsets.append(
            PageOffset(
                page=int(marker.group(1)),
                char_start=marker.end(),
                char_end=page_end,
            )
        )

    return offsets
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def process_document(
    file_path: str,
    document_id: str,
    provenance_id: str,
    mode: Literal["fast", "balanced", "accurate"] = "balanced",
    timeout: int = 300,
    # New Datalab API parameters
    max_pages: int | None = None,
    page_range: str | None = None,
    skip_cache: bool = False,
    disable_image_extraction: bool = False,
    extras: list[str] | None = None,
    page_schema: str | None = None,
    additional_config: dict | None = None,
    file_url: str | None = None,
) -> OCRResult:
    """
    Process a document through Datalab OCR.

    This is the MAIN function. Everything else supports this.

    Args:
        file_path: Path to document (PDF, image, or Office file). Ignored
            when file_url is given.
        document_id: UUID of the document record in database
        provenance_id: UUID for the OCR_RESULT provenance record
        mode: OCR quality mode (accurate costs more but better quality)
        timeout: Maximum wait time in seconds. It is converted to a poll
            count at a 3-second interval, with a floor of 30 polls, so
            values below 90 still request 30 polls.
        max_pages: Maximum pages to process (Datalab limit: 7000)
        page_range: Specific pages to process, 0-indexed (e.g. "0-5,10")
        skip_cache: Force reprocessing, skip Datalab cache
        disable_image_extraction: Skip image extraction for text-only processing
        extras: Extra Datalab features (e.g. ["track_changes", "chart_understanding"])
        page_schema: JSON schema string for structured data extraction per page
        additional_config: Additional Datalab config dict
        file_url: URL of file to process (instead of local file, passed to Datalab as file_url)

    Returns:
        OCRResult with extracted text and metadata

    Raises:
        OCRAPIError: On 4xx/5xx API responses, invalid input, an
            unsuccessful result, or any unexpected failure
        OCRRateLimitError: On 429 (wait and retry)
        OCRAuthenticationError: On 401/403
        OCRTimeoutError: On timeout
        OCRFileError: On file access issues
        ValueError: On missing API key
    """
    # Imported lazily so the module can be loaded without the SDK installed.
    from datalab_sdk import ConvertOptions, DatalabClient
    from datalab_sdk.exceptions import (
        DatalabAPIError,
        DatalabFileError,
        DatalabTimeoutError,
        DatalabValidationError,
    )

    # Validate inputs
    if file_url:
        validated_path = None  # No local file when using URL
        logger.info(f"Processing document from URL: {file_url} (mode={mode})")
    else:
        validated_path = validate_file(file_path)
        logger.info(f"Processing document: {validated_path} (mode={mode})")
    api_key = get_api_key()

    # Record timing
    start_time = time.time()
    start_timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    # Generate unique request ID for tracking (created locally, not by Datalab)
    request_id = str(uuid.uuid4())

    try:
        # Initialize client
        client = DatalabClient(api_key=api_key)

        # Configure options - paginate=True for page offset tracking
        options = ConvertOptions(output_format="markdown,json", mode=mode, paginate=True)
        # Only set optional Datalab API params if provided
        if max_pages is not None:
            options.max_pages = max_pages
        if page_range is not None:
            options.page_range = page_range
        if skip_cache:
            options.skip_cache = True
        if disable_image_extraction:
            options.disable_image_extraction = True
        if extras:
            # SDK expects comma-separated string, not list
            options.extras = ",".join(extras) if isinstance(extras, list) else extras
        if page_schema:
            options.page_schema = page_schema
        if additional_config:
            options.additional_config = additional_config

        # Calculate max_polls based on timeout (3 second poll interval) (FIX-P2-1)
        poll_interval = 3
        max_polls = max(timeout // poll_interval, 30)

        # Call Datalab API
        if file_url:
            result = client.convert(
                file_url=file_url,
                options=options,
                max_polls=max_polls,
                poll_interval=poll_interval,
            )
        else:
            result = client.convert(
                file_path=str(validated_path),
                options=options,
                max_polls=max_polls,
                poll_interval=poll_interval,
            )

        # Record completion
        end_time = time.time()
        end_timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        duration_ms = int((end_time - start_time) * 1000)

        # Check for errors in result
        if not result.success:
            error_msg = result.error or "Unknown error during OCR processing"
            logger.error(f"OCR failed: {error_msg}")
            raise OCRAPIError(error_msg, status_code=500, request_id=request_id)

        # Extract data from result
        markdown = result.markdown or ""
        page_count = result.page_count or 1
        quality_score = result.parse_quality_score

        # Get cost from response
        # SDK v0.2.1 returns: {"list_cost_cents": N, "final_cost_cents": N}
        # final_cost_cents is the actual charge after any discounts
        cost_breakdown = result.cost_breakdown or {}
        cost_cents = cost_breakdown.get("final_cost_cents")
        if cost_cents is None:
            cost_cents = cost_breakdown.get("total_cost_cents")
        if cost_breakdown and cost_cents is None:
            logger.warning(
                "cost_breakdown present but no cost key found. Keys: %s",
                list(cost_breakdown.keys()),
            )

        # Capture images from Datalab response (filename -> base64 data)
        # Images are returned as a dict with filename keys and base64-encoded image data
        images = getattr(result, "images", None) or {}
        if images:
            logger.info(f"Captured {len(images)} images from Datalab response")

        # Capture JSON block hierarchy (from output_format="markdown,json")
        json_blocks = None
        raw_json = getattr(result, "json", None)
        if raw_json is not None:
            if isinstance(raw_json, dict):
                json_blocks = raw_json
            elif hasattr(raw_json, "__dict__"):
                json_blocks = raw_json.__dict__
            else:
                logger.warning(f"JSON output requested but got unexpected type: {type(raw_json)}")
        if json_blocks is not None:
            children = json_blocks.get("children", json_blocks.get("blocks", []))
            logger.info(
                f"Captured JSON block hierarchy with {len(children) if isinstance(children, list) else 0} top-level blocks"
            )

        # Capture metadata (page_stats, block_counts, etc.)
        # A shallow copy is taken so that adding "extras_features" below does
        # not write into the SDK's own result object.
        metadata_dict = None
        raw_metadata = getattr(result, "metadata", None)
        if raw_metadata is not None:
            if isinstance(raw_metadata, dict):
                metadata_dict = dict(raw_metadata)
            elif hasattr(raw_metadata, "__dict__"):
                metadata_dict = dict(raw_metadata.__dict__)

        # Capture structured extraction result (when page_schema provided)
        extraction_json = None
        raw_extraction = getattr(result, "extraction_schema_json", None)
        if raw_extraction is not None:
            if isinstance(raw_extraction, str):
                extraction_json = json.loads(raw_extraction)
            elif isinstance(raw_extraction, (dict, list)):
                extraction_json = raw_extraction
        if extraction_json is not None:
            logger.info("Captured structured extraction data")

        # Capture extras feature data (when extras params are enabled)
        # These are returned as top-level attributes on the result object
        extras_features: dict = {}
        for extras_key in (
            "links",
            "charts",
            "tracked_changes",
            "table_row_bboxes",
            "infographics",
        ):
            val = getattr(result, extras_key, None)
            if val is not None:
                extras_features[extras_key] = val
        if extras_features:
            # Merge extras features into metadata dict for downstream storage
            if metadata_dict is None:
                metadata_dict = {}
            metadata_dict["extras_features"] = extras_features
            logger.info(f"Captured extras features: {list(extras_features.keys())}")

        # Extract document metadata fields from Datalab metadata
        doc_title = None
        doc_author = None
        doc_subject = None
        if metadata_dict:
            doc_title = metadata_dict.get("title")
            doc_author = metadata_dict.get("author")
            doc_subject = metadata_dict.get("subject")

        # Parse page offsets for provenance tracking
        page_offsets = parse_page_offsets(markdown)

        # Compute content hash (matching src/utils/hash.ts format)
        content_hash = compute_content_hash(markdown)

        ocr_result = OCRResult(
            id=str(uuid.uuid4()),
            provenance_id=provenance_id,
            document_id=document_id,
            extracted_text=markdown,
            text_length=len(markdown),
            datalab_request_id=request_id,
            datalab_mode=mode,
            parse_quality_score=quality_score,
            page_count=page_count,
            cost_cents=cost_cents,
            content_hash=content_hash,
            processing_started_at=start_timestamp,
            processing_completed_at=end_timestamp,
            processing_duration_ms=duration_ms,
            page_offsets=page_offsets,
            images=images if images else None,
            json_blocks=json_blocks,
            metadata=metadata_dict,
            extraction_json=extraction_json,
            cost_breakdown_full=cost_breakdown if cost_breakdown else None,
            doc_title=doc_title,
            doc_author=doc_author,
            doc_subject=doc_subject,
        )

        logger.info(
            f"OCR complete: {page_count} pages, {len(markdown)} chars, "
            f"{duration_ms}ms, cost=${(cost_cents or 0) / 100:.4f}"
        )

        return ocr_result

    except OCRError:
        # Already one of our own errors (e.g. the unsuccessful-result case
        # above): let it through untouched instead of having the catch-all
        # below log it a second time as "unexpected" and wrap it again.
        raise

    except DatalabAPIError as e:
        # The attribute may be missing or present-but-None; either way fall
        # back to 500 so the comparison in OCRAPIError cannot hit None.
        status = getattr(e, "status_code", None) or 500
        error_msg = str(e)
        if status == 429 or "rate limit" in error_msg.lower():
            logger.error(f"Rate limit exceeded: {e}")
            raise OCRRateLimitError(error_msg) from e
        elif status in (401, 403):
            logger.error(f"Authentication error ({status}): {e}")
            raise OCRAuthenticationError(error_msg, status) from e
        else:
            logger.error(f"API error ({status}): {e}")
            raise OCRAPIError(error_msg, status, request_id) from e

    except DatalabTimeoutError as e:
        logger.error(f"Timeout after {timeout}s: {e}")
        raise OCRTimeoutError(str(e), request_id) from e

    except DatalabFileError as e:
        logger.error(f"File error: {e}")
        raise OCRFileError(str(e), file_url or str(validated_path)) from e

    except DatalabValidationError as e:
        logger.error(f"Validation error: {e}")
        raise OCRAPIError(f"Invalid input: {e}", 400, request_id) from e

    except Exception as e:
        # Catch-all for unexpected errors - still fail fast
        logger.error(f"Unexpected error during OCR: {e}")
        raise OCRAPIError(str(e), 500, request_id) from e
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
# =============================================================================
|
|
567
|
+
# CLI INTERFACE (for manual testing)
|
|
568
|
+
# =============================================================================
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def main() -> None:
    """
    CLI entry point for manual testing.

    Parses the command line, runs process_document() once and prints the
    result to stdout, either as a short human-readable summary or, with
    --json, as a single compact JSON document. On any failure the error is
    logged to stderr (and, with --json, also printed to stdout as a JSON
    object with "error", "category" and "details" keys) and the process
    exits with status 1.
    """
    # Load .env file if present
    try:
        from dotenv import load_dotenv

        env_path = Path(__file__).parent.parent / ".env"
        if env_path.exists():
            load_dotenv(env_path)
            logger.debug(f"Loaded environment from {env_path}")
    except ImportError:
        pass  # python-dotenv not installed, skip

    parser = argparse.ArgumentParser(
        description="Datalab OCR Worker - Extract text from documents",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single PDF
  python ocr_worker.py --file ./data/bench/doc_0005.pdf --mode accurate

  # Process with JSON output
  python ocr_worker.py --file ./data/bench/doc_0005.pdf --json
        """,
    )
    parser.add_argument("--file", "-f", type=str, help="Single file to process")
    parser.add_argument(
        "--file-url", type=str, help="URL of file to process (instead of local file)"
    )
    parser.add_argument(
        "--mode",
        "-m",
        choices=["fast", "balanced", "accurate"],
        default="balanced",
        help="OCR mode (default: balanced)",
    )
    parser.add_argument(
        "--doc-id", type=str, help="Document ID (UUID) - auto-generated if not provided"
    )
    parser.add_argument(
        "--prov-id", type=str, help="Provenance ID (UUID) - auto-generated if not provided"
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    # Datalab API parameters
    parser.add_argument("--max-pages", type=int, help="Max pages to process (Datalab limit: 7000)")
    parser.add_argument("--page-range", type=str, help='Page range, 0-indexed (e.g. "0-5,10")')
    parser.add_argument(
        "--skip-cache", action="store_true", help="Force reprocessing, skip Datalab cache"
    )
    parser.add_argument(
        "--disable-image-extraction", action="store_true", help="Skip image extraction"
    )
    parser.add_argument(
        "--extras",
        type=str,
        help='Comma-separated extras (e.g. "track_changes,chart_understanding")',
    )
    parser.add_argument(
        "--page-schema", type=str, help="JSON schema string for structured extraction per page"
    )
    parser.add_argument(
        "--additional-config", type=str, help="JSON string of additional Datalab config"
    )

    args = parser.parse_args()

    if args.json:
        # Suppress logging in JSON mode for clean output
        logging.getLogger().setLevel(logging.CRITICAL)

    if not args.file and not args.file_url:
        parser.error("Either --file or --file-url is required")

    try:
        # Use provided IDs or generate new ones
        doc_id = args.doc_id or str(uuid.uuid4())
        prov_id = args.prov_id or str(uuid.uuid4())

        # Parse extras list from comma-separated string. Surrounding
        # whitespace and empty entries are dropped so that input such as
        # "a, b" or "a,,b" does not forward malformed names.
        extras_list = None
        if args.extras:
            extras_list = [
                name.strip() for name in args.extras.split(",") if name.strip()
            ] or None

        # Parse additional config JSON
        additional_config = json.loads(args.additional_config) if args.additional_config else None

        result = process_document(
            args.file or "",
            document_id=doc_id,
            provenance_id=prov_id,
            mode=args.mode,
            max_pages=args.max_pages,
            page_range=args.page_range,
            skip_cache=args.skip_cache,
            disable_image_extraction=args.disable_image_extraction,
            extras=extras_list,
            page_schema=args.page_schema,
            additional_config=additional_config,
            file_url=args.file_url,
        )

        if args.json:
            # asdict() recursively converts nested dataclasses
            # Use compact format (no indent) for python-shell compatibility
            print(json.dumps(asdict(result)))
        else:
            print("=== OCR Result ===")
            print(f"Pages: {result.page_count}")
            print(f"Characters: {result.text_length}")
            print(f"Duration: {result.processing_duration_ms}ms")
            print(f"Cost: ${(result.cost_cents or 0) / 100:.4f}")
            print(f"Quality: {result.parse_quality_score}")
            print(f"Hash: {result.content_hash[:40]}...")
            print("\n=== Extracted Text (first 500 chars) ===")
            print(result.extracted_text[:500])

    except Exception as e:
        if args.json:
            # In --json mode the root logger was raised to CRITICAL to keep
            # the output clean, so the fatal error is logged at CRITICAL
            # level to make sure it still reaches stderr for diagnostics.
            logger.critical(f"Fatal error: {e}", exc_info=True)

            # Mirror the failure on stdout as JSON, including whichever
            # optional attributes this particular exception carries.
            details = {}
            if hasattr(e, "status_code"):
                details["status_code"] = e.status_code
            if hasattr(e, "request_id"):
                details["request_id"] = e.request_id
            if hasattr(e, "file_path"):
                details["file_path"] = e.file_path
            print(
                json.dumps(
                    {
                        "error": str(e),
                        "category": getattr(e, "category", "OCR_API_ERROR"),
                        "details": details,
                    }
                )
            )
        else:
            logger.exception(f"Fatal error: {e}")
        sys.exit(1)
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|