ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Extract images from DOCX documents using stdlib zipfile + Pillow.
|
|
4
|
+
|
|
5
|
+
DOCX files are ZIP archives containing images in word/media/. This module
|
|
6
|
+
extracts those images and maps them to estimated page positions by parsing
|
|
7
|
+
word/document.xml for image references (a:blip elements).
|
|
8
|
+
|
|
9
|
+
This is a parallel extractor to image_extractor.py (PDF) for the OCR
|
|
10
|
+
Provenance MCP system, enabling VLM analysis of DOCX document images.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
python docx_image_extractor.py --input /path/to/doc.docx --output /path/to/images/
|
|
14
|
+
python docx_image_extractor.py -i doc.docx -o ./images --min-size 100 --max-images 50
|
|
15
|
+
|
|
16
|
+
Output:
|
|
17
|
+
JSON to stdout with extraction results:
|
|
18
|
+
{
|
|
19
|
+
"success": true,
|
|
20
|
+
"count": 5,
|
|
21
|
+
"images": [
|
|
22
|
+
{
|
|
23
|
+
"page": 1,
|
|
24
|
+
"index": 0,
|
|
25
|
+
"format": "png",
|
|
26
|
+
"width": 800,
|
|
27
|
+
"height": 600,
|
|
28
|
+
"bbox": {"x": 0, "y": 0, "width": 800, "height": 600},
|
|
29
|
+
"path": "/path/to/images/p001_i000.png",
|
|
30
|
+
"size": 12345
|
|
31
|
+
},
|
|
32
|
+
...
|
|
33
|
+
]
|
|
34
|
+
}
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import argparse
|
|
38
|
+
import io
|
|
39
|
+
import json
|
|
40
|
+
import os
|
|
41
|
+
import shutil
|
|
42
|
+
import subprocess
|
|
43
|
+
import sys
|
|
44
|
+
import tempfile
|
|
45
|
+
import xml.etree.ElementTree as ET
|
|
46
|
+
import zipfile
|
|
47
|
+
from pathlib import Path
|
|
48
|
+
from typing import Any
|
|
49
|
+
|
|
50
|
+
# Check for Pillow
|
|
51
|
+
try:
|
|
52
|
+
from PIL import Image
|
|
53
|
+
except ImportError:
|
|
54
|
+
print(
|
|
55
|
+
json.dumps(
|
|
56
|
+
{
|
|
57
|
+
"success": False,
|
|
58
|
+
"error": "Pillow not installed. Run: pip install Pillow",
|
|
59
|
+
"images": [],
|
|
60
|
+
}
|
|
61
|
+
)
|
|
62
|
+
)
|
|
63
|
+
sys.exit(1)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# OOXML namespaces used in word/document.xml
|
|
67
|
+
NSMAP = {
|
|
68
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
69
|
+
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
|
70
|
+
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
71
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
72
|
+
"pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
|
|
73
|
+
"v": "urn:schemas-microsoft-com:vml",
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
# Relationship namespace for .rels files
|
|
77
|
+
RELS_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
|
|
78
|
+
|
|
79
|
+
# Paragraphs per estimated page
|
|
80
|
+
PARAGRAPHS_PER_PAGE = 40
|
|
81
|
+
|
|
82
|
+
# Formats accepted by Gemini VLM - anything else must be converted to PNG
|
|
83
|
+
GEMINI_NATIVE_FORMATS = {"png", "jpg", "jpeg", "gif", "webp"}
|
|
84
|
+
|
|
85
|
+
# Cache inkscape availability check
|
|
86
|
+
_INKSCAPE_PATH: str | None = shutil.which("inkscape")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _convert_with_inkscape(img_bytes: bytes, ext: str, filename: str) -> tuple[bool, bytes]:
    """Rasterize an EMF/WMF payload to PNG by shelling out to inkscape.

    The bytes are written to a scratch directory, inkscape is asked to export
    a PNG next to them, and the scratch directory is always removed afterwards.

    Args:
        img_bytes: Raw image data to convert.
        ext: Extension used for the temporary input file (e.g. "emf").
        filename: Original media name, used only in warning messages.

    Returns:
        ``(True, png_bytes)`` when inkscape produced an output file, otherwise
        ``(False, img_bytes)`` with the input returned untouched. Failures are
        reported as a WARNING line on stderr, never raised.
    """
    # Nothing to do when inkscape was not found at import time.
    if _INKSCAPE_PATH is None:
        return False, img_bytes

    scratch_dir = tempfile.mkdtemp(prefix="docx_img_")
    try:
        input_file = os.path.join(scratch_dir, f"input.{ext}")
        output_file = os.path.join(scratch_dir, "output.png")
        with open(input_file, "wb") as sink:
            sink.write(img_bytes)

        command = [
            _INKSCAPE_PATH,
            input_file,
            "--export-type=png",
            f"--export-filename={output_file}",
        ]
        proc = subprocess.run(command, capture_output=True, text=True, timeout=30)

        # Success requires both a clean exit status and an actual output file.
        if proc.returncode == 0 and os.path.exists(output_file):
            with open(output_file, "rb") as source:
                return True, source.read()

        # Only the first 200 characters of stderr are reported.
        warning = f"WARNING: inkscape failed for '{filename}': {proc.stderr[:200]}"
    except subprocess.TimeoutExpired:
        warning = f"WARNING: inkscape timed out converting '{filename}'"
    except Exception as exc:
        warning = f"WARNING: inkscape error for '{filename}': {exc}"
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)

    # Every non-success path ends here: report and hand back the original bytes.
    print(warning, file=sys.stderr)
    return False, img_bytes
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _parse_relationships(zf: zipfile.ZipFile) -> dict[str, str]:
    """
    Read word/_rels/document.xml.rels and map each relationship Id to its Target.

    Args:
        zf: The opened DOCX archive.

    Returns:
        Dictionary such as ``{"rId5": "media/image1.png"}``. Relationships
        with an empty Id or Target are left out. An empty dict is returned
        when the .rels part is absent or is not well-formed XML (the latter
        also prints a WARNING to stderr).
    """
    rels_member = "word/_rels/document.xml.rels"

    try:
        with zf.open(rels_member) as handle:
            root = ET.parse(handle).getroot()  # noqa: S314 - parsing trusted DOCX internal XML
    except KeyError:
        # ZipFile.open raises KeyError when the member does not exist.
        return {}
    except ET.ParseError as exc:
        print(f"WARNING: Failed to parse {rels_member}: {exc}", file=sys.stderr)
        return {}

    relationship_tag = f"{{{RELS_NS}}}Relationship"
    id_target_pairs = (
        (node.get("Id", ""), node.get("Target", ""))
        for node in root.iter(relationship_tag)
    )
    # A later duplicate Id overwrites an earlier one.
    return {rel_id: target for rel_id, target in id_target_pairs if rel_id and target}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _parse_image_positions(
    zf: zipfile.ZipFile,
    rid_to_target: dict[str, str],
) -> list[dict[str, Any]]:
    """
    Locate image references in word/document.xml by paragraph.

    Paragraphs (<w:p>) are numbered from 0 in document iteration order. Each
    a:blip found inside a paragraph whose r:embed id is a key of
    ``rid_to_target`` produces one entry.

    Args:
        zf: The opened DOCX archive.
        rid_to_target: Relationship Id -> target path map.

    Returns:
        List of ``{"paragraph_index": int, "media_file": str}`` dicts, where
        media_file is the last path component of the relationship target.
        Empty when document.xml is absent or not well-formed XML (the latter
        also prints a WARNING to stderr).
    """
    document_member = "word/document.xml"

    try:
        with zf.open(document_member) as handle:
            root = ET.parse(handle).getroot()  # noqa: S314 - parsing trusted DOCX internal XML
    except KeyError:
        # ZipFile.open raises KeyError when the member does not exist.
        return []
    except ET.ParseError as exc:
        print(f"WARNING: Failed to parse {document_member}: {exc}", file=sys.stderr)
        return []

    # Fully-qualified ElementTree names: "{namespace-uri}local".
    paragraph_tag = "{" + NSMAP["w"] + "}p"
    blip_tag = "{" + NSMAP["a"] + "}blip"
    embed_attr = "{" + NSMAP["r"] + "}embed"

    found: list[dict[str, Any]] = []
    for para_no, paragraph in enumerate(root.iter(paragraph_tag)):
        for blip in paragraph.iter(blip_tag):
            rel_id = blip.get(embed_attr, "")
            if not rel_id or rel_id not in rid_to_target:
                continue
            # Keep only the text after the last "/" ("media/image1.png" ->
            # "image1.png"); a target without "/" is kept whole.
            media_name = rid_to_target[rel_id].rpartition("/")[2]
            found.append({"paragraph_index": para_no, "media_file": media_name})

    return found
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _estimate_page(paragraph_index: int) -> int:
    """Turn a zero-based paragraph index into an estimated page number starting at 1.

    The estimate assumes a fixed PARAGRAPHS_PER_PAGE paragraphs on every page.
    """
    full_pages_before = paragraph_index // PARAGRAPHS_PER_PAGE
    return full_pages_before + 1
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def extract_images(
|
|
227
|
+
docx_path: str,
|
|
228
|
+
output_dir: str,
|
|
229
|
+
min_size: int = 50,
|
|
230
|
+
max_images: int = 100,
|
|
231
|
+
formats: list[str] | None = None,
|
|
232
|
+
) -> dict[str, Any]:
|
|
233
|
+
"""
|
|
234
|
+
Extract images from a DOCX document.
|
|
235
|
+
|
|
236
|
+
Args:
|
|
237
|
+
docx_path: Path to the DOCX file
|
|
238
|
+
output_dir: Directory to save extracted images
|
|
239
|
+
min_size: Minimum dimension (width or height) to include an image
|
|
240
|
+
max_images: Maximum number of images to extract
|
|
241
|
+
formats: List of formats to include (default: all)
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
Dictionary with success status and list of extracted images
|
|
245
|
+
"""
|
|
246
|
+
output = Path(output_dir)
|
|
247
|
+
output.mkdir(parents=True, exist_ok=True)
|
|
248
|
+
|
|
249
|
+
images: list[dict[str, Any]] = []
|
|
250
|
+
errors: list[str] = []
|
|
251
|
+
|
|
252
|
+
# Open DOCX as ZIP - fail fast if it cannot be opened
|
|
253
|
+
try:
|
|
254
|
+
zf = zipfile.ZipFile(docx_path, "r")
|
|
255
|
+
except zipfile.BadZipFile:
|
|
256
|
+
return {
|
|
257
|
+
"success": False,
|
|
258
|
+
"error": (
|
|
259
|
+
f"Cannot open as ZIP archive: {docx_path}. "
|
|
260
|
+
"The file may be corrupted or not a valid DOCX. "
|
|
261
|
+
"Verify the file opens in Microsoft Word or LibreOffice."
|
|
262
|
+
),
|
|
263
|
+
"images": [],
|
|
264
|
+
}
|
|
265
|
+
except FileNotFoundError:
|
|
266
|
+
return {
|
|
267
|
+
"success": False,
|
|
268
|
+
"error": f"DOCX file not found: {docx_path}",
|
|
269
|
+
"images": [],
|
|
270
|
+
}
|
|
271
|
+
except PermissionError:
|
|
272
|
+
return {
|
|
273
|
+
"success": False,
|
|
274
|
+
"error": (
|
|
275
|
+
f"Permission denied reading: {docx_path}. "
|
|
276
|
+
"Check file permissions with: ls -la '{docx_path}'"
|
|
277
|
+
),
|
|
278
|
+
"images": [],
|
|
279
|
+
}
|
|
280
|
+
except Exception as e:
|
|
281
|
+
return {
|
|
282
|
+
"success": False,
|
|
283
|
+
"error": f"Failed to open DOCX file '{docx_path}': {type(e).__name__}: {e}",
|
|
284
|
+
"images": [],
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
with zf:
|
|
288
|
+
# List all files in word/media/
|
|
289
|
+
media_files = [
|
|
290
|
+
name
|
|
291
|
+
for name in zf.namelist()
|
|
292
|
+
if name.startswith("word/media/") and not name.endswith("/")
|
|
293
|
+
]
|
|
294
|
+
|
|
295
|
+
# No images directory - valid DOCX with no embedded images
|
|
296
|
+
if not media_files:
|
|
297
|
+
return {
|
|
298
|
+
"success": True,
|
|
299
|
+
"count": 0,
|
|
300
|
+
"images": [],
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
# Parse relationships and document.xml for position mapping
|
|
304
|
+
rid_to_target = _parse_relationships(zf)
|
|
305
|
+
image_positions = _parse_image_positions(zf, rid_to_target)
|
|
306
|
+
|
|
307
|
+
# Build a lookup: media filename -> paragraph index
|
|
308
|
+
media_to_paragraph: dict[str, int] = {}
|
|
309
|
+
for pos in image_positions:
|
|
310
|
+
fname = pos["media_file"]
|
|
311
|
+
if fname not in media_to_paragraph:
|
|
312
|
+
media_to_paragraph[fname] = pos["paragraph_index"]
|
|
313
|
+
|
|
314
|
+
# Sort media files for deterministic output
|
|
315
|
+
media_files.sort()
|
|
316
|
+
|
|
317
|
+
count = 0
|
|
318
|
+
# Per-page image index tracking (matches PDF extractor pattern)
|
|
319
|
+
page_image_counts: dict[int, int] = {}
|
|
320
|
+
|
|
321
|
+
for zip_entry in media_files:
|
|
322
|
+
if count >= max_images:
|
|
323
|
+
break
|
|
324
|
+
|
|
325
|
+
media_filename = zip_entry.split("/")[-1]
|
|
326
|
+
ext = media_filename.rsplit(".", 1)[-1].lower() if "." in media_filename else ""
|
|
327
|
+
|
|
328
|
+
# Filter by format if specified
|
|
329
|
+
if formats and ext not in [f.lower() for f in formats]:
|
|
330
|
+
continue
|
|
331
|
+
|
|
332
|
+
# Read image bytes from ZIP
|
|
333
|
+
try:
|
|
334
|
+
img_bytes = zf.read(zip_entry)
|
|
335
|
+
except Exception as e:
|
|
336
|
+
errors.append(
|
|
337
|
+
f"File '{zip_entry}': Failed to read from ZIP: "
|
|
338
|
+
f"{type(e).__name__}: {e}. The DOCX archive may be corrupted."
|
|
339
|
+
)
|
|
340
|
+
continue
|
|
341
|
+
|
|
342
|
+
# Get dimensions using PIL (C-1: close pil_img after use)
|
|
343
|
+
try:
|
|
344
|
+
pil_img = Image.open(io.BytesIO(img_bytes))
|
|
345
|
+
width, height = pil_img.size
|
|
346
|
+
except Exception as e:
|
|
347
|
+
errors.append(
|
|
348
|
+
f"File '{zip_entry}': Failed to read image dimensions with Pillow: "
|
|
349
|
+
f"{type(e).__name__}: {e}. The image data may be corrupted or in "
|
|
350
|
+
f"an unsupported format."
|
|
351
|
+
)
|
|
352
|
+
continue
|
|
353
|
+
|
|
354
|
+
# Skip images smaller than min_size
|
|
355
|
+
if width < min_size or height < min_size:
|
|
356
|
+
pil_img.close()
|
|
357
|
+
continue
|
|
358
|
+
|
|
359
|
+
# Estimate page from paragraph position
|
|
360
|
+
paragraph_idx = media_to_paragraph.get(media_filename, 0)
|
|
361
|
+
page = _estimate_page(paragraph_idx)
|
|
362
|
+
|
|
363
|
+
bbox = {
|
|
364
|
+
"x": 0,
|
|
365
|
+
"y": 0,
|
|
366
|
+
"width": width,
|
|
367
|
+
"height": height,
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
# Convert non-native formats (EMF, WMF, BMP, TIFF) to PNG
|
|
371
|
+
# so the VLM pipeline (Gemini) can process them.
|
|
372
|
+
save_ext = ext
|
|
373
|
+
if ext not in GEMINI_NATIVE_FORMATS:
|
|
374
|
+
converted = False
|
|
375
|
+
# For EMF/WMF: use inkscape (best Linux EMF rasterizer)
|
|
376
|
+
if not converted and ext in ("emf", "wmf"):
|
|
377
|
+
converted, img_bytes = _convert_with_inkscape(img_bytes, ext, media_filename)
|
|
378
|
+
if converted:
|
|
379
|
+
save_ext = "png"
|
|
380
|
+
# Fallback to Pillow for simpler formats (BMP, TIFF)
|
|
381
|
+
# M-6: close RGBA intermediate and BytesIO buffer
|
|
382
|
+
if not converted:
|
|
383
|
+
try:
|
|
384
|
+
buf = io.BytesIO()
|
|
385
|
+
rgba_img = pil_img.convert("RGBA")
|
|
386
|
+
rgba_img.save(buf, format="PNG")
|
|
387
|
+
rgba_img.close()
|
|
388
|
+
img_bytes = buf.getvalue()
|
|
389
|
+
buf.close()
|
|
390
|
+
save_ext = "png"
|
|
391
|
+
converted = True
|
|
392
|
+
except Exception as e:
|
|
393
|
+
print(f"WARNING: Failed to convert {ext} to PNG: {e}", file=sys.stderr)
|
|
394
|
+
if not converted:
|
|
395
|
+
errors.append(
|
|
396
|
+
f"File '{media_filename}': Cannot convert {ext.upper()} to "
|
|
397
|
+
f"Gemini-compatible format (png/jpg/gif/webp). Saving as "
|
|
398
|
+
f"{ext.upper()}. VLM processing will skip this image. "
|
|
399
|
+
f"Install inkscape to enable conversion: "
|
|
400
|
+
f"sudo apt install inkscape"
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
if converted:
|
|
404
|
+
# Re-read dimensions from converted image
|
|
405
|
+
try:
|
|
406
|
+
with Image.open(io.BytesIO(img_bytes)) as converted_img:
|
|
407
|
+
width, height = converted_img.size
|
|
408
|
+
except Exception as e:
|
|
409
|
+
print(
|
|
410
|
+
f"WARNING: Failed to read converted image dimensions: {e}",
|
|
411
|
+
file=sys.stderr,
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
# C-1: close pil_img now that dimensions and conversion are done
|
|
415
|
+
pil_img.close()
|
|
416
|
+
|
|
417
|
+
# Per-page image index (matches PDF extractor pattern)
|
|
418
|
+
img_idx = page_image_counts.get(page, 0)
|
|
419
|
+
page_image_counts[page] = img_idx + 1
|
|
420
|
+
|
|
421
|
+
# Generate filename matching PDF extractor pattern
|
|
422
|
+
filename = f"p{page:03d}_i{img_idx:03d}.{save_ext}"
|
|
423
|
+
filepath = output / filename
|
|
424
|
+
|
|
425
|
+
# Save image
|
|
426
|
+
try:
|
|
427
|
+
with open(filepath, "wb") as f:
|
|
428
|
+
f.write(img_bytes)
|
|
429
|
+
except Exception as e:
|
|
430
|
+
errors.append(
|
|
431
|
+
f"File '{zip_entry}': Failed to save to '{filepath}': "
|
|
432
|
+
f"{type(e).__name__}: {e}. Check that the output directory "
|
|
433
|
+
f"'{output_dir}' is writable."
|
|
434
|
+
)
|
|
435
|
+
continue
|
|
436
|
+
|
|
437
|
+
img_size = len(img_bytes)
|
|
438
|
+
# M-7: free img_bytes after writing to disk
|
|
439
|
+
del img_bytes
|
|
440
|
+
|
|
441
|
+
images.append(
|
|
442
|
+
{
|
|
443
|
+
"page": page,
|
|
444
|
+
"index": img_idx,
|
|
445
|
+
"format": save_ext,
|
|
446
|
+
"width": width,
|
|
447
|
+
"height": height,
|
|
448
|
+
"bbox": bbox,
|
|
449
|
+
"path": str(filepath.absolute()),
|
|
450
|
+
"size": img_size,
|
|
451
|
+
}
|
|
452
|
+
)
|
|
453
|
+
count += 1
|
|
454
|
+
|
|
455
|
+
result: dict[str, Any] = {
|
|
456
|
+
"success": True,
|
|
457
|
+
"count": len(images),
|
|
458
|
+
"images": images,
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
if errors:
|
|
462
|
+
result["warnings"] = errors
|
|
463
|
+
|
|
464
|
+
return result
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def main():
    """Command-line entry point.

    Parses --input/-i, --output/-o, --min-size and --max-images, runs the
    extraction and prints the result as a single JSON object on stdout.
    Exits with status 0 when the result reports success and 1 otherwise
    (including when the input path is not an existing file).
    """
    cli = argparse.ArgumentParser(
        description="Extract images from DOCX documents for VLM analysis"
    )
    # (flags, add_argument keyword options), registered in this order
    argument_table = (
        (("--input", "-i"), {"required": True, "help": "Path to input DOCX file"}),
        (
            ("--output", "-o"),
            {"required": True, "help": "Output directory for extracted images"},
        ),
        (
            ("--min-size",),
            {
                "type": int,
                "default": 50,
                "help": "Minimum image dimension in pixels (default: 50)",
            },
        ),
        (
            ("--max-images",),
            {
                "type": int,
                "default": 100,
                "help": "Maximum images to extract (default: 100)",
            },
        ),
    )
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)

    opts = cli.parse_args()

    if os.path.isfile(opts.input):
        outcome = extract_images(
            docx_path=opts.input,
            output_dir=opts.output,
            min_size=opts.min_size,
            max_images=opts.max_images,
        )
        exit_status = 0 if outcome["success"] else 1
    else:
        # Reject a missing input up front, in the same JSON shape as other failures
        outcome = {
            "success": False,
            "error": f"Input file does not exist: {opts.input}",
            "images": [],
        }
        exit_status = 1

    print(json.dumps(outcome))
    sys.exit(exit_status)


if __name__ == "__main__":
    main()
|