ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,800 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VLM Pipeline - Batch Image Processing with Embedding Integration
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the full VLM processing pipeline:
|
|
5
|
+
* 1. Fetch pending images from database
|
|
6
|
+
* 2. Analyze with Gemini VLM
|
|
7
|
+
* 3. Generate embeddings for descriptions
|
|
8
|
+
* 4. Track provenance
|
|
9
|
+
* 5. Update database records
|
|
10
|
+
*
|
|
11
|
+
* @module services/vlm/pipeline
|
|
12
|
+
*/
|
|
13
|
+
import { existsSync, unlinkSync } from 'fs';
|
|
14
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
15
|
+
import { getVLMService, } from './service.js';
|
|
16
|
+
import { getImage, getImagesByDocument, getPendingImages, setImageProcessing, updateImageVLMResult, setImageVLMFailed, setImageVLMSkipped, getImageStats, findByContentHash, copyVLMResult, resetProcessingImages, } from '../storage/database/image-operations.js';
|
|
17
|
+
import { getEmbeddingClient, MODEL_NAME as EMBEDDING_MODEL, } from '../embedding/nomic.js';
|
|
18
|
+
import { computeHash } from '../../utils/hash.js';
|
|
19
|
+
import { ProvenanceType } from '../../models/provenance.js';
|
|
20
|
+
import { getImageOptimizer } from '../images/optimizer.js';
|
|
21
|
+
const DEFAULT_CONFIG = {
    // Baseline settings; the VLMPipeline constructor spreads caller-supplied config over these.

    // Slice size used by processImages(); processPending() defaults its limit to batchSize * 10.
    batchSize: 10,
    // NOTE(review): not read by any code visible in this file section (processBatch runs
    // sequentially) - confirm whether anything still consumes it.
    concurrency: 5,
    // Confidence below this value is only logged by processImage(); the result is still stored.
    minConfidence: 0.5,
    // Forwarded to the VLM service's describeImage() options.
    useUniversalPrompt: true,
    // When true, processImage() does not generate or store an embedding for the description.
    skipEmbeddings: false,
    // When true, no VLM_DESCRIPTION provenance record is written.
    skipProvenance: false,
    imageOptimization: {
        // Gates both the relevance filter and the pre-VLM resize step.
        enabled: true,
        // NOTE(review): not referenced in the visible pipeline code; presumably used by the OCR path.
        ocrMaxWidth: 4800,
        // Images whose largest side exceeds this are sent to the optimizer for resizing.
        vlmMaxDimension: 2048,
        // Images whose largest side is below this many pixels are skipped.
        vlmSkipBelowSize: 50,
        // Passed to the image optimizer as minRelevanceScore.
        vlmMinRelevance: 0.3,
        // Enables the file-based relevance analysis (optimizer.analyzeImage).
        vlmSkipLogosIcons: true,
    },
};
|
|
37
|
+
/**
|
|
38
|
+
* VLMPipeline - Orchestrates image processing workflow
|
|
39
|
+
*
|
|
40
|
+
* Integrates VLM analysis with:
|
|
41
|
+
* - Database operations (image records)
|
|
42
|
+
* - Embedding generation (Nomic)
|
|
43
|
+
* - Vector storage (sqlite-vec)
|
|
44
|
+
* - Provenance tracking
|
|
45
|
+
* - Image relevance filtering (logos, icons, decorative elements)
|
|
46
|
+
*/
|
|
47
|
+
export class VLMPipeline {
|
|
48
|
+
vlm;
|
|
49
|
+
embeddingClient;
|
|
50
|
+
config;
|
|
51
|
+
db;
|
|
52
|
+
dbService;
|
|
53
|
+
vectorService;
|
|
54
|
+
optimizer;
|
|
55
|
+
constructor(db, options) {
|
|
56
|
+
this.db = db;
|
|
57
|
+
this.vlm = options.vlmService ?? getVLMService();
|
|
58
|
+
this.embeddingClient = options.embeddingClient ?? getEmbeddingClient();
|
|
59
|
+
this.config = { ...DEFAULT_CONFIG, ...options.config };
|
|
60
|
+
this.dbService = options.dbService ?? null;
|
|
61
|
+
this.vectorService = options.vectorService;
|
|
62
|
+
this.optimizer =
|
|
63
|
+
options.optimizer ??
|
|
64
|
+
getImageOptimizer({
|
|
65
|
+
vlmMaxDimension: this.config.imageOptimization.vlmMaxDimension,
|
|
66
|
+
vlmSkipBelowSize: this.config.imageOptimization.vlmSkipBelowSize,
|
|
67
|
+
minRelevanceScore: this.config.imageOptimization.vlmMinRelevance,
|
|
68
|
+
});
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Process all images in a document.
|
|
72
|
+
*
|
|
73
|
+
* @param documentId - Document UUID
|
|
74
|
+
* @returns BatchResult with processing summary
|
|
75
|
+
*/
|
|
76
|
+
async processDocument(documentId) {
|
|
77
|
+
// Reset any stuck 'processing' images back to pending (crash recovery)
|
|
78
|
+
const stuckCount = resetProcessingImages(this.db, documentId);
|
|
79
|
+
if (stuckCount > 0) {
|
|
80
|
+
console.error(`[VLMPipeline] Reset ${stuckCount} stuck processing images for document ${documentId}`);
|
|
81
|
+
}
|
|
82
|
+
const pending = getImagesByDocument(this.db, documentId, { vlmStatus: 'pending' }).filter((img) => !img.is_header_footer);
|
|
83
|
+
if (pending.length === 0) {
|
|
84
|
+
return {
|
|
85
|
+
total: 0,
|
|
86
|
+
successful: 0,
|
|
87
|
+
failed: 0,
|
|
88
|
+
skipped: 0,
|
|
89
|
+
totalTokens: 0,
|
|
90
|
+
totalTimeMs: 0,
|
|
91
|
+
results: [],
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
return this.processImages(pending);
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Process all pending images in the database.
|
|
98
|
+
*
|
|
99
|
+
* @param limit - Maximum images to process
|
|
100
|
+
* @returns BatchResult with processing summary
|
|
101
|
+
*/
|
|
102
|
+
async processPending(limit) {
|
|
103
|
+
const images = getPendingImages(this.db, limit ?? this.config.batchSize * 10).filter((img) => !img.is_header_footer);
|
|
104
|
+
return this.processImages(images);
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Process a single image by ID.
|
|
108
|
+
*
|
|
109
|
+
* @param imageId - Image UUID
|
|
110
|
+
* @returns ProcessingResult
|
|
111
|
+
*/
|
|
112
|
+
async processOne(imageId) {
|
|
113
|
+
const image = getImage(this.db, imageId);
|
|
114
|
+
if (!image) {
|
|
115
|
+
return {
|
|
116
|
+
imageId,
|
|
117
|
+
success: false,
|
|
118
|
+
error: 'Image not found',
|
|
119
|
+
processingTimeMs: 0,
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
const [result] = await this.processBatch([image]);
|
|
123
|
+
return result;
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Process array of images in batches.
|
|
127
|
+
*/
|
|
128
|
+
async processImages(images) {
|
|
129
|
+
const startTime = Date.now();
|
|
130
|
+
const results = [];
|
|
131
|
+
for (let i = 0; i < images.length; i += this.config.batchSize) {
|
|
132
|
+
const batch = images.slice(i, i + this.config.batchSize);
|
|
133
|
+
const batchResults = await this.processBatch(batch);
|
|
134
|
+
results.push(...batchResults);
|
|
135
|
+
}
|
|
136
|
+
// Count successful (processed), skipped (relevance filtered), and failed
|
|
137
|
+
const successful = results.filter((r) => r.success && r.description);
|
|
138
|
+
const skipped = results.filter((r) => r.success && !r.description && r.error?.startsWith('Skipped:'));
|
|
139
|
+
const failed = results.filter((r) => !r.success);
|
|
140
|
+
return {
|
|
141
|
+
total: results.length,
|
|
142
|
+
successful: successful.length,
|
|
143
|
+
failed: failed.length,
|
|
144
|
+
skipped: skipped.length,
|
|
145
|
+
totalTokens: successful.reduce((sum, r) => sum + (r.tokensUsed || 0), 0),
|
|
146
|
+
totalTimeMs: Date.now() - startTime,
|
|
147
|
+
results,
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
/**
|
|
151
|
+
* Process a batch of images with rate limiting and exponential backoff.
|
|
152
|
+
*
|
|
153
|
+
* F-INTEG-9: Uses exponential backoff after any failed image (100ms -> 200ms -> 400ms -> ... -> 32s max); the delay resets to 100ms after a success.
|
|
154
|
+
* Aborts batch after 5 consecutive failures to avoid wasting resources.
|
|
155
|
+
*/
|
|
156
|
+
async processBatch(images) {
|
|
157
|
+
const BASE_DELAY_MS = 100; // 100ms courtesy delay; rate limiter handles throttling (FIX-P0-2)
|
|
158
|
+
const MAX_DELAY_MS = 32000; // 32 second max backoff
|
|
159
|
+
const MAX_CONSECUTIVE_FAILURES = 5; // Abort batch after this many consecutive failures
|
|
160
|
+
// Mark all as processing (returns false if image not in 'pending' state)
|
|
161
|
+
const claimedImages = [];
|
|
162
|
+
for (const img of images) {
|
|
163
|
+
const claimed = setImageProcessing(this.db, img.id);
|
|
164
|
+
if (!claimed) {
|
|
165
|
+
console.error(`[WARN] Image ${img.id} is no longer pending, skipping`);
|
|
166
|
+
continue;
|
|
167
|
+
}
|
|
168
|
+
claimedImages.push(img);
|
|
169
|
+
}
|
|
170
|
+
// Process SEQUENTIALLY with rate limiting (no concurrency)
|
|
171
|
+
const results = [];
|
|
172
|
+
let currentDelay = BASE_DELAY_MS;
|
|
173
|
+
let consecutiveFailures = 0;
|
|
174
|
+
for (let i = 0; i < claimedImages.length; i++) {
|
|
175
|
+
const img = claimedImages[i];
|
|
176
|
+
// Abort batch if too many consecutive failures (likely API outage)
|
|
177
|
+
if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
|
|
178
|
+
console.error(`[VLMPipeline] Aborting batch: ${MAX_CONSECUTIVE_FAILURES} consecutive failures. ` +
|
|
179
|
+
`Processed ${results.length}/${claimedImages.length} images.`);
|
|
180
|
+
// Mark remaining as failed so they can be retried later
|
|
181
|
+
for (let j = i; j < claimedImages.length; j++) {
|
|
182
|
+
try {
|
|
183
|
+
setImageVLMFailed(this.db, claimedImages[j].id, 'Batch aborted: too many consecutive failures');
|
|
184
|
+
}
|
|
185
|
+
catch (error) {
|
|
186
|
+
console.error(`[VLMPipeline] Failed to mark image ${claimedImages[j].id} as failed during batch abort: ${String(error)}`);
|
|
187
|
+
}
|
|
188
|
+
results.push({
|
|
189
|
+
imageId: claimedImages[j].id,
|
|
190
|
+
success: false,
|
|
191
|
+
error: 'Batch aborted: too many consecutive failures',
|
|
192
|
+
processingTimeMs: 0,
|
|
193
|
+
});
|
|
194
|
+
}
|
|
195
|
+
break;
|
|
196
|
+
}
|
|
197
|
+
// Rate limit: wait between requests (skip for first request)
|
|
198
|
+
if (i > 0) {
|
|
199
|
+
console.error(`[VLMPipeline] Rate limiting: waiting ${currentDelay / 1000}s before next request...`);
|
|
200
|
+
await new Promise((resolve) => setTimeout(resolve, currentDelay));
|
|
201
|
+
}
|
|
202
|
+
console.error(`[VLMPipeline] Processing image ${i + 1}/${claimedImages.length}: ${img.id}`);
|
|
203
|
+
try {
|
|
204
|
+
const result = await this.processImage(img);
|
|
205
|
+
results.push(result);
|
|
206
|
+
if (result.success) {
|
|
207
|
+
console.error(`[VLMPipeline] Success: ${img.id} (confidence: ${result.confidence?.toFixed(2)})`);
|
|
208
|
+
// Reset backoff on success
|
|
209
|
+
currentDelay = BASE_DELAY_MS;
|
|
210
|
+
consecutiveFailures = 0;
|
|
211
|
+
}
|
|
212
|
+
else {
|
|
213
|
+
console.error(`[VLMPipeline] Failed: ${img.id} - ${result.error}`);
|
|
214
|
+
consecutiveFailures++;
|
|
215
|
+
// Apply exponential backoff on failure (likely 429 or 5xx)
|
|
216
|
+
currentDelay = Math.min(currentDelay * 2, MAX_DELAY_MS);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
catch (error) {
|
|
220
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
221
|
+
console.error(`[VLMPipeline] Error: ${img.id} - ${errorMessage}`);
|
|
222
|
+
results.push({
|
|
223
|
+
imageId: img.id,
|
|
224
|
+
success: false,
|
|
225
|
+
error: errorMessage,
|
|
226
|
+
processingTimeMs: 0,
|
|
227
|
+
});
|
|
228
|
+
consecutiveFailures++;
|
|
229
|
+
// Apply exponential backoff on error
|
|
230
|
+
currentDelay = Math.min(currentDelay * 2, MAX_DELAY_MS);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
return results;
|
|
234
|
+
}
|
|
235
|
+
/**
|
|
236
|
+
* Process a single image through the full pipeline.
|
|
237
|
+
* Includes relevance filtering to skip logos, icons, and decorative elements.
|
|
238
|
+
*/
|
|
239
|
+
async processImage(image) {
|
|
240
|
+
const start = Date.now();
|
|
241
|
+
try {
|
|
242
|
+
// Validate image has extracted file
|
|
243
|
+
if (!image.extracted_path) {
|
|
244
|
+
const error = 'No extracted image file';
|
|
245
|
+
setImageVLMFailed(this.db, image.id, error);
|
|
246
|
+
return {
|
|
247
|
+
imageId: image.id,
|
|
248
|
+
success: false,
|
|
249
|
+
error,
|
|
250
|
+
processingTimeMs: Date.now() - start,
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
// Check image relevance if optimization enabled
|
|
254
|
+
if (this.config.imageOptimization.enabled) {
|
|
255
|
+
const shouldProcess = await this.checkImageRelevance(image);
|
|
256
|
+
if (!shouldProcess.process) {
|
|
257
|
+
const skipReason = `Skipped: ${shouldProcess.reason}`;
|
|
258
|
+
console.error(`[VLMPipeline] ${skipReason} - ${image.id}`);
|
|
259
|
+
// Dedup copies are already marked 'complete' by copyVLMResult — don't re-mark
|
|
260
|
+
if (shouldProcess.dedupSource) {
|
|
261
|
+
// Create VLM_DESCRIPTION provenance for the dedup copy
|
|
262
|
+
this.trackDedupProvenance(image, shouldProcess.dedupSource);
|
|
263
|
+
}
|
|
264
|
+
else {
|
|
265
|
+
// Mark as 'complete' (not 'failed') so retry_failed won't reprocess intentionally-skipped images
|
|
266
|
+
setImageVLMSkipped(this.db, image.id, skipReason);
|
|
267
|
+
}
|
|
268
|
+
return {
|
|
269
|
+
imageId: image.id,
|
|
270
|
+
success: true, // Not a failure, intentionally skipped
|
|
271
|
+
error: skipReason,
|
|
272
|
+
processingTimeMs: Date.now() - start,
|
|
273
|
+
};
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
// Verify image file exists on disk before processing
|
|
277
|
+
if (!existsSync(image.extracted_path)) {
|
|
278
|
+
const error = `Image file not found on disk: ${image.extracted_path} (image_id: ${image.id}). The database record exists but the file has been deleted.`;
|
|
279
|
+
setImageVLMFailed(this.db, image.id, error);
|
|
280
|
+
return {
|
|
281
|
+
imageId: image.id,
|
|
282
|
+
success: false,
|
|
283
|
+
error,
|
|
284
|
+
processingTimeMs: Date.now() - start,
|
|
285
|
+
};
|
|
286
|
+
}
|
|
287
|
+
// Optionally resize large images for VLM
|
|
288
|
+
let imagePath = image.extracted_path;
|
|
289
|
+
if (this.config.imageOptimization.enabled) {
|
|
290
|
+
const resized = await this.maybeResizeForVLM(image);
|
|
291
|
+
if (resized) {
|
|
292
|
+
imagePath = resized;
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
try {
|
|
296
|
+
// Run VLM analysis
|
|
297
|
+
const vlmResult = await this.vlm.describeImage(imagePath, {
|
|
298
|
+
contextText: image.context_text ?? undefined,
|
|
299
|
+
useUniversalPrompt: this.config.useUniversalPrompt,
|
|
300
|
+
});
|
|
301
|
+
// Check confidence threshold
|
|
302
|
+
if (vlmResult.analysis.confidence < this.config.minConfidence) {
|
|
303
|
+
console.error(`[VLMPipeline] Low confidence (${vlmResult.analysis.confidence}) for image ${image.id}`);
|
|
304
|
+
}
|
|
305
|
+
// Track VLM_DESCRIPTION provenance FIRST (returns provenance ID for embedding chain)
|
|
306
|
+
let vlmProvId;
|
|
307
|
+
if (!this.config.skipProvenance && this.dbService) {
|
|
308
|
+
vlmProvId = this.trackProvenance(image, vlmResult);
|
|
309
|
+
}
|
|
310
|
+
// Generate embedding for description with VLM provenance ID
|
|
311
|
+
// T2.10: Include VLM extracted text in embedding for FTS searchability
|
|
312
|
+
let embeddingId = null;
|
|
313
|
+
if (!this.config.skipEmbeddings && vlmResult.description) {
|
|
314
|
+
let textForEmbedding = vlmResult.description;
|
|
315
|
+
if (vlmResult.analysis?.extractedText?.length > 0) {
|
|
316
|
+
textForEmbedding += '\n\nExtracted text: ' + vlmResult.analysis.extractedText.join(', ');
|
|
317
|
+
}
|
|
318
|
+
embeddingId = await this.generateAndStoreEmbedding(textForEmbedding, image, vlmProvId);
|
|
319
|
+
}
|
|
320
|
+
// Build VLM result for database
|
|
321
|
+
const dbResult = {
|
|
322
|
+
description: vlmResult.description,
|
|
323
|
+
structuredData: this.convertToStructuredData(vlmResult.analysis),
|
|
324
|
+
embeddingId: embeddingId || '',
|
|
325
|
+
model: vlmResult.model,
|
|
326
|
+
confidence: vlmResult.analysis.confidence,
|
|
327
|
+
tokensUsed: vlmResult.tokensUsed,
|
|
328
|
+
};
|
|
329
|
+
// Update database record
|
|
330
|
+
updateImageVLMResult(this.db, image.id, dbResult);
|
|
331
|
+
return {
|
|
332
|
+
imageId: image.id,
|
|
333
|
+
success: true,
|
|
334
|
+
description: vlmResult.description,
|
|
335
|
+
embeddingId: embeddingId ?? undefined,
|
|
336
|
+
tokensUsed: vlmResult.tokensUsed,
|
|
337
|
+
confidence: vlmResult.analysis.confidence,
|
|
338
|
+
processingTimeMs: Date.now() - start,
|
|
339
|
+
};
|
|
340
|
+
}
|
|
341
|
+
finally {
|
|
342
|
+
// Clean up temp resized file if it differs from the original
|
|
343
|
+
if (imagePath !== image.extracted_path) {
|
|
344
|
+
try {
|
|
345
|
+
unlinkSync(imagePath);
|
|
346
|
+
}
|
|
347
|
+
catch (cleanupErr) {
|
|
348
|
+
console.error('[VLMPipeline] Failed to clean up temp resized file:', cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr));
|
|
349
|
+
/* ignore cleanup errors */
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
catch (error) {
|
|
355
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
356
|
+
// Mark as failed in database
|
|
357
|
+
try {
|
|
358
|
+
setImageVLMFailed(this.db, image.id, errorMessage);
|
|
359
|
+
}
|
|
360
|
+
catch (secondaryError) {
|
|
361
|
+
console.error('[VLMPipeline] Failed to mark image as failed:', image.id, secondaryError instanceof Error ? secondaryError.message : String(secondaryError));
|
|
362
|
+
}
|
|
363
|
+
return {
|
|
364
|
+
imageId: image.id,
|
|
365
|
+
success: false,
|
|
366
|
+
error: errorMessage,
|
|
367
|
+
processingTimeMs: Date.now() - start,
|
|
368
|
+
};
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Check if an image should be processed by VLM based on relevance analysis.
|
|
373
|
+
*
|
|
374
|
+
* Uses multi-layer heuristics to filter out:
|
|
375
|
+
* - Tiny images (likely icons)
|
|
376
|
+
* - Extreme aspect ratios (likely banners/decorative)
|
|
377
|
+
* - Low color diversity (likely logos)
|
|
378
|
+
*
|
|
379
|
+
* @param image - Image reference with dimensions
|
|
380
|
+
* @returns Object with process flag and reason
|
|
381
|
+
*/
|
|
382
|
+
async checkImageRelevance(image) {
|
|
383
|
+
const { imageOptimization } = this.config;
|
|
384
|
+
// LAYER 1: Header/footer block classification (from Datalab JSON)
|
|
385
|
+
if (image.is_header_footer) {
|
|
386
|
+
return {
|
|
387
|
+
process: false,
|
|
388
|
+
reason: `Header/footer decorative: block_type=${image.block_type ?? 'unknown'}`,
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
// LAYER 2: Figure blocks are always content — skip further checks
|
|
392
|
+
if (image.block_type === 'Figure' || image.block_type === 'FigureGroup') {
|
|
393
|
+
return { process: true, reason: 'Figure block — content image' };
|
|
394
|
+
}
|
|
395
|
+
// LAYER 3: Content hash deduplication
|
|
396
|
+
if (image.content_hash) {
|
|
397
|
+
const duplicate = findByContentHash(this.db, image.content_hash, image.id);
|
|
398
|
+
if (duplicate) {
|
|
399
|
+
// Copy VLM results from the existing processed image
|
|
400
|
+
copyVLMResult(this.db, image.id, duplicate);
|
|
401
|
+
return {
|
|
402
|
+
process: false,
|
|
403
|
+
reason: `Duplicate of image ${duplicate.id} — VLM results copied, 0 tokens used`,
|
|
404
|
+
dedupSource: duplicate,
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
// LAYER 4: Quick dimension check (no file I/O needed)
|
|
409
|
+
const width = image.dimensions?.width ?? 0;
|
|
410
|
+
const height = image.dimensions?.height ?? 0;
|
|
411
|
+
if (width > 0 && height > 0) {
|
|
412
|
+
if (Math.max(width, height) < imageOptimization.vlmSkipBelowSize) {
|
|
413
|
+
return {
|
|
414
|
+
process: false,
|
|
415
|
+
reason: `Too small: ${width}x${height} < ${imageOptimization.vlmSkipBelowSize}px`,
|
|
416
|
+
};
|
|
417
|
+
}
|
|
418
|
+
if (Math.max(width, height) < 100) {
|
|
419
|
+
return {
|
|
420
|
+
process: false,
|
|
421
|
+
reason: `Likely icon: ${width}x${height} (largest dim < 100px)`,
|
|
422
|
+
};
|
|
423
|
+
}
|
|
424
|
+
const aspectRatio = Math.max(width, height) / Math.min(width, height);
|
|
425
|
+
if (aspectRatio > 6) {
|
|
426
|
+
return {
|
|
427
|
+
process: false,
|
|
428
|
+
reason: `Extreme aspect ratio: ${aspectRatio.toFixed(1)}:1 (likely banner/separator)`,
|
|
429
|
+
};
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
// LAYER 5: Full file-based analysis (existing Python optimizer)
|
|
433
|
+
if (imageOptimization.vlmSkipLogosIcons && image.extracted_path) {
|
|
434
|
+
try {
|
|
435
|
+
const analysis = await this.optimizer.analyzeImage(image.extracted_path);
|
|
436
|
+
if (analysis.success && !analysis.should_vlm) {
|
|
437
|
+
return {
|
|
438
|
+
process: false,
|
|
439
|
+
reason: analysis.skip_reason ?? `Low relevance: ${analysis.overall_relevance}`,
|
|
440
|
+
};
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
catch (error) {
|
|
444
|
+
const errMsg = error instanceof Error ? error.message : String(error);
|
|
445
|
+
console.error(`[VLMPipeline] Relevance analysis failed for ${image.id}, skipping to avoid processing potentially irrelevant images: ${errMsg}`);
|
|
446
|
+
return {
|
|
447
|
+
process: false,
|
|
448
|
+
reason: `Relevance analysis failed: ${errMsg}. Skipping to avoid processing potentially irrelevant images.`,
|
|
449
|
+
};
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
return { process: true, reason: 'Passed all relevance checks' };
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Resize an image for VLM if it exceeds the max dimension.
|
|
456
|
+
*
|
|
457
|
+
* @param image - Image reference
|
|
458
|
+
* @returns Path to resized image, or null if no resize needed
|
|
459
|
+
*/
|
|
460
|
+
async maybeResizeForVLM(image) {
|
|
461
|
+
if (!image.extracted_path)
|
|
462
|
+
return null;
|
|
463
|
+
const { vlmMaxDimension } = this.config.imageOptimization;
|
|
464
|
+
const width = image.dimensions?.width ?? 0;
|
|
465
|
+
const height = image.dimensions?.height ?? 0;
|
|
466
|
+
const maxDim = Math.max(width, height);
|
|
467
|
+
// Unknown dimensions (Datalab images) - skip resize
|
|
468
|
+
if (maxDim === 0) {
|
|
469
|
+
return null;
|
|
470
|
+
}
|
|
471
|
+
// Dimensions known but within limit - no resize needed
|
|
472
|
+
if (maxDim <= vlmMaxDimension) {
|
|
473
|
+
return null;
|
|
474
|
+
}
|
|
475
|
+
// Try to resize
|
|
476
|
+
try {
|
|
477
|
+
const result = await this.optimizer.resizeForVLM(image.extracted_path);
|
|
478
|
+
if (result.success && 'output_path' in result) {
|
|
479
|
+
if (result.resized) {
|
|
480
|
+
console.error(`[VLMPipeline] Resized image for VLM: ${result.original_width}x${result.original_height} -> ${result.output_width}x${result.output_height}`);
|
|
481
|
+
}
|
|
482
|
+
return result.output_path;
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
catch (error) {
|
|
486
|
+
console.error(`[VLMPipeline] Failed to resize image ${image.id}, using original: ${error}`);
|
|
487
|
+
}
|
|
488
|
+
return null;
|
|
489
|
+
}
|
|
490
|
+
/**
|
|
491
|
+
* Generate embedding and store in vector database.
|
|
492
|
+
* Creates EMBEDDING provenance at depth 4 (from VLM_DESCRIPTION).
|
|
493
|
+
*
|
|
494
|
+
* @param description - VLM description text to embed
|
|
495
|
+
* @param image - Source image reference
|
|
496
|
+
* @param vlmDescriptionProvId - VLM_DESCRIPTION provenance ID for chain tracking
|
|
497
|
+
*/
|
|
498
|
+
async generateAndStoreEmbedding(description, image, vlmDescriptionProvId) {
|
|
499
|
+
// Generate embedding vector
|
|
500
|
+
const vectors = await this.embeddingClient.embedChunks([description], 1);
|
|
501
|
+
if (vectors.length === 0) {
|
|
502
|
+
throw new Error('Embedding generation returned empty result');
|
|
503
|
+
}
|
|
504
|
+
const vector = vectors[0];
|
|
505
|
+
const embeddingId = uuidv4();
|
|
506
|
+
// Store in database and vector storage - database service is REQUIRED
|
|
507
|
+
if (!this.dbService) {
|
|
508
|
+
throw new Error('VLM embedding storage requires dbService - pipeline was created without database service');
|
|
509
|
+
}
|
|
510
|
+
{
|
|
511
|
+
// Create EMBEDDING provenance if we have VLM_DESCRIPTION provenance
|
|
512
|
+
let embeddingProvId = embeddingId; // Default: use embedding ID as provenance ID
|
|
513
|
+
if (vlmDescriptionProvId) {
|
|
514
|
+
embeddingProvId = uuidv4();
|
|
515
|
+
const vlmProv = this.dbService.getProvenance(vlmDescriptionProvId);
|
|
516
|
+
if (vlmProv) {
|
|
517
|
+
// Build parent_ids: ... + VLM_DESCRIPTION
|
|
518
|
+
const parentIds = JSON.parse(vlmProv.parent_ids);
|
|
519
|
+
parentIds.push(vlmDescriptionProvId);
|
|
520
|
+
const now = new Date().toISOString();
|
|
521
|
+
const embeddingProvRecord = {
|
|
522
|
+
id: embeddingProvId,
|
|
523
|
+
type: ProvenanceType.EMBEDDING,
|
|
524
|
+
created_at: now,
|
|
525
|
+
processed_at: now,
|
|
526
|
+
source_file_created_at: null,
|
|
527
|
+
source_file_modified_at: null,
|
|
528
|
+
source_type: 'EMBEDDING',
|
|
529
|
+
source_path: null,
|
|
530
|
+
source_id: vlmDescriptionProvId, // Parent is VLM_DESCRIPTION
|
|
531
|
+
root_document_id: vlmProv.root_document_id,
|
|
532
|
+
location: {
|
|
533
|
+
page_number: image.page_number,
|
|
534
|
+
chunk_index: image.image_index,
|
|
535
|
+
},
|
|
536
|
+
content_hash: computeHash(description),
|
|
537
|
+
input_hash: vlmProv.content_hash,
|
|
538
|
+
file_hash: vlmProv.file_hash,
|
|
539
|
+
processor: EMBEDDING_MODEL,
|
|
540
|
+
processor_version: '1.5.0',
|
|
541
|
+
processing_params: { task_type: 'search_document', dimensions: 768 },
|
|
542
|
+
processing_duration_ms: null,
|
|
543
|
+
processing_quality_score: null,
|
|
544
|
+
parent_id: vlmDescriptionProvId,
|
|
545
|
+
parent_ids: JSON.stringify(parentIds),
|
|
546
|
+
chain_depth: 4, // EMBEDDING from VLM_DESCRIPTION is depth 4
|
|
547
|
+
chain_path: JSON.stringify([
|
|
548
|
+
'DOCUMENT',
|
|
549
|
+
'OCR_RESULT',
|
|
550
|
+
'IMAGE',
|
|
551
|
+
'VLM_DESCRIPTION',
|
|
552
|
+
'EMBEDDING',
|
|
553
|
+
]),
|
|
554
|
+
};
|
|
555
|
+
this.dbService.insertProvenance(embeddingProvRecord);
|
|
556
|
+
}
|
|
557
|
+
else {
|
|
558
|
+
// vlmDescriptionProvId was set but provenance not found - fall back
|
|
559
|
+
console.error(`[VLMPipeline] VLM description provenance ${vlmDescriptionProvId} not found, using embedding ID as provenance`);
|
|
560
|
+
embeddingProvId = embeddingId;
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
// Create embedding record (VLM description embeddings use image_id, not chunk_id)
|
|
564
|
+
this.dbService.insertEmbedding({
|
|
565
|
+
id: embeddingId,
|
|
566
|
+
chunk_id: null, // VLM embeddings don't have a chunk
|
|
567
|
+
image_id: image.id, // Use image ID for VLM embeddings
|
|
568
|
+
extraction_id: null, // VLM embeddings don't have an extraction
|
|
569
|
+
document_id: image.document_id,
|
|
570
|
+
original_text: description,
|
|
571
|
+
original_text_length: description.length,
|
|
572
|
+
source_file_path: image.extracted_path ?? 'unknown',
|
|
573
|
+
source_file_name: image.extracted_path?.split('/').pop() ?? 'vlm_description',
|
|
574
|
+
source_file_hash: 'vlm_generated',
|
|
575
|
+
page_number: image.page_number,
|
|
576
|
+
page_range: null,
|
|
577
|
+
character_start: 0,
|
|
578
|
+
character_end: description.length,
|
|
579
|
+
chunk_index: image.image_index,
|
|
580
|
+
total_chunks: 1,
|
|
581
|
+
model_name: EMBEDDING_MODEL,
|
|
582
|
+
model_version: '1.5.0',
|
|
583
|
+
task_type: 'search_document',
|
|
584
|
+
inference_mode: 'local',
|
|
585
|
+
gpu_device: 'cuda:0',
|
|
586
|
+
provenance_id: embeddingProvId, // Use embedding provenance ID
|
|
587
|
+
content_hash: computeHash(description),
|
|
588
|
+
generation_duration_ms: null,
|
|
589
|
+
});
|
|
590
|
+
// Store vector
|
|
591
|
+
this.vectorService.storeVector(embeddingId, vector);
|
|
592
|
+
}
|
|
593
|
+
return embeddingId;
|
|
594
|
+
}
|
|
595
|
+
/**
|
|
596
|
+
* Convert ImageAnalysis to VLMStructuredData format.
|
|
597
|
+
*/
|
|
598
|
+
convertToStructuredData(analysis) {
|
|
599
|
+
return {
|
|
600
|
+
imageType: analysis.imageType,
|
|
601
|
+
primarySubject: analysis.primarySubject,
|
|
602
|
+
extractedText: analysis.extractedText,
|
|
603
|
+
dates: analysis.dates,
|
|
604
|
+
names: analysis.names,
|
|
605
|
+
numbers: analysis.numbers,
|
|
606
|
+
paragraph1: analysis.paragraph1,
|
|
607
|
+
paragraph2: analysis.paragraph2,
|
|
608
|
+
paragraph3: analysis.paragraph3,
|
|
609
|
+
};
|
|
610
|
+
}
|
|
611
|
+
/**
|
|
612
|
+
* Track VLM_DESCRIPTION provenance for VLM processing output.
|
|
613
|
+
* Chain: DOCUMENT (0) -> OCR_RESULT (1) -> IMAGE (2) -> VLM_DESCRIPTION (3)
|
|
614
|
+
*
|
|
615
|
+
* @param image - Source image reference with provenance_id
|
|
616
|
+
* @param vlmResult - VLM analysis result
|
|
617
|
+
* @returns Provenance ID for the VLM_DESCRIPTION record (used for embedding chain)
|
|
618
|
+
*/
|
|
619
|
+
trackProvenance(image, vlmResult) {
|
|
620
|
+
if (!this.dbService) {
|
|
621
|
+
throw new Error('DatabaseService required for provenance tracking');
|
|
622
|
+
}
|
|
623
|
+
const provenanceId = uuidv4();
|
|
624
|
+
const now = new Date().toISOString();
|
|
625
|
+
// Get IMAGE provenance to build parent chain
|
|
626
|
+
if (!image.provenance_id) {
|
|
627
|
+
throw new Error(`Image ${image.id} has no provenance_id - cannot track VLM provenance`);
|
|
628
|
+
}
|
|
629
|
+
const imageProv = this.dbService.getProvenance(image.provenance_id);
|
|
630
|
+
if (!imageProv) {
|
|
631
|
+
throw new Error(`Image provenance not found: ${image.provenance_id}`);
|
|
632
|
+
}
|
|
633
|
+
// Build parent_ids: document + OCR + IMAGE
|
|
634
|
+
const parentIds = JSON.parse(imageProv.parent_ids);
|
|
635
|
+
parentIds.push(image.provenance_id);
|
|
636
|
+
const record = {
|
|
637
|
+
id: provenanceId,
|
|
638
|
+
type: ProvenanceType.VLM_DESCRIPTION, // CORRECT type for VLM descriptions
|
|
639
|
+
created_at: now,
|
|
640
|
+
processed_at: now,
|
|
641
|
+
source_file_created_at: null,
|
|
642
|
+
source_file_modified_at: null,
|
|
643
|
+
source_type: 'VLM', // CORRECT source type
|
|
644
|
+
source_path: image.extracted_path,
|
|
645
|
+
source_id: image.provenance_id, // Parent is IMAGE
|
|
646
|
+
root_document_id: imageProv.root_document_id,
|
|
647
|
+
location: {
|
|
648
|
+
page_number: image.page_number,
|
|
649
|
+
chunk_index: image.image_index,
|
|
650
|
+
},
|
|
651
|
+
content_hash: computeHash(vlmResult.description),
|
|
652
|
+
input_hash: imageProv.content_hash, // Input was the image
|
|
653
|
+
file_hash: imageProv.file_hash,
|
|
654
|
+
processor: `gemini-vlm:${vlmResult.model}`,
|
|
655
|
+
processor_version: '3.0',
|
|
656
|
+
processing_params: {
|
|
657
|
+
type: 'vlm_description',
|
|
658
|
+
confidence: vlmResult.analysis.confidence,
|
|
659
|
+
tokensUsed: vlmResult.tokensUsed,
|
|
660
|
+
},
|
|
661
|
+
processing_duration_ms: vlmResult.processingTimeMs,
|
|
662
|
+
processing_quality_score: vlmResult.analysis.confidence,
|
|
663
|
+
parent_id: image.provenance_id,
|
|
664
|
+
parent_ids: JSON.stringify(parentIds),
|
|
665
|
+
chain_depth: 3, // VLM_DESCRIPTION is depth 3
|
|
666
|
+
chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'IMAGE', 'VLM_DESCRIPTION']),
|
|
667
|
+
};
|
|
668
|
+
this.dbService.insertProvenance(record);
|
|
669
|
+
return provenanceId; // Return the ID so we can use it for embedding provenance
|
|
670
|
+
}
|
|
671
|
+
/**
|
|
672
|
+
* Track VLM_DESCRIPTION provenance for a deduplicated image.
|
|
673
|
+
* Creates provenance record documenting that VLM results were copied from a source image
|
|
674
|
+
* with identical content hash, preserving full chain: DOCUMENT(0) -> OCR_RESULT(1) -> IMAGE(2) -> VLM_DESCRIPTION(3).
|
|
675
|
+
*
|
|
676
|
+
* @param image - The dedup copy image that received copied VLM results
|
|
677
|
+
* @param source - The source image whose VLM results were copied
|
|
678
|
+
*/
|
|
679
|
+
trackDedupProvenance(image, source) {
|
|
680
|
+
if (!this.dbService || this.config.skipProvenance)
|
|
681
|
+
return;
|
|
682
|
+
if (!image.provenance_id) {
|
|
683
|
+
console.error(`[VLMPipeline] Cannot track dedup provenance: image ${image.id} has no provenance_id`);
|
|
684
|
+
return;
|
|
685
|
+
}
|
|
686
|
+
const imageProv = this.dbService.getProvenance(image.provenance_id);
|
|
687
|
+
if (!imageProv) {
|
|
688
|
+
console.error(`[VLMPipeline] Image provenance not found: ${image.provenance_id}`);
|
|
689
|
+
return;
|
|
690
|
+
}
|
|
691
|
+
const provenanceId = uuidv4();
|
|
692
|
+
const now = new Date().toISOString();
|
|
693
|
+
const parentIds = JSON.parse(imageProv.parent_ids);
|
|
694
|
+
parentIds.push(image.provenance_id);
|
|
695
|
+
if (!source.vlm_description) {
|
|
696
|
+
console.error(`[VLMPipeline] Cannot create dedup provenance: source image ${source.id} has null vlm_description despite vlm_status=complete`);
|
|
697
|
+
return;
|
|
698
|
+
}
|
|
699
|
+
const record = {
|
|
700
|
+
id: provenanceId,
|
|
701
|
+
type: ProvenanceType.VLM_DESCRIPTION,
|
|
702
|
+
created_at: now,
|
|
703
|
+
processed_at: now,
|
|
704
|
+
source_file_created_at: null,
|
|
705
|
+
source_file_modified_at: null,
|
|
706
|
+
source_type: 'VLM_DEDUP',
|
|
707
|
+
source_path: image.extracted_path,
|
|
708
|
+
source_id: image.provenance_id,
|
|
709
|
+
root_document_id: imageProv.root_document_id,
|
|
710
|
+
location: {
|
|
711
|
+
page_number: image.page_number,
|
|
712
|
+
chunk_index: image.image_index,
|
|
713
|
+
},
|
|
714
|
+
content_hash: computeHash(source.vlm_description),
|
|
715
|
+
input_hash: imageProv.content_hash,
|
|
716
|
+
file_hash: imageProv.file_hash,
|
|
717
|
+
processor: 'dedup-copy',
|
|
718
|
+
processor_version: '1.0.0',
|
|
719
|
+
processing_params: {
|
|
720
|
+
type: 'vlm_dedup_copy',
|
|
721
|
+
source_image_id: source.id,
|
|
722
|
+
content_hash: image.content_hash,
|
|
723
|
+
},
|
|
724
|
+
processing_duration_ms: 0,
|
|
725
|
+
processing_quality_score: source.vlm_confidence,
|
|
726
|
+
parent_id: image.provenance_id,
|
|
727
|
+
parent_ids: JSON.stringify(parentIds),
|
|
728
|
+
chain_depth: 3,
|
|
729
|
+
chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'IMAGE', 'VLM_DESCRIPTION']),
|
|
730
|
+
};
|
|
731
|
+
this.dbService.insertProvenance(record);
|
|
732
|
+
console.error(`[VLMPipeline] Created dedup VLM_DESCRIPTION provenance: ${provenanceId} (source: ${source.id})`);
|
|
733
|
+
// If source has an embedding, create EMBEDDING provenance linking to target's chain
|
|
734
|
+
// This ensures the dedup target has a complete provenance chain including the shared embedding
|
|
735
|
+
if (source.vlm_embedding_id) {
|
|
736
|
+
const embProvId = uuidv4();
|
|
737
|
+
const embParentIds = [...parentIds, provenanceId];
|
|
738
|
+
const embRecord = {
|
|
739
|
+
id: embProvId,
|
|
740
|
+
type: ProvenanceType.EMBEDDING,
|
|
741
|
+
created_at: now,
|
|
742
|
+
processed_at: now,
|
|
743
|
+
source_file_created_at: null,
|
|
744
|
+
source_file_modified_at: null,
|
|
745
|
+
source_type: 'EMBEDDING',
|
|
746
|
+
source_path: null,
|
|
747
|
+
source_id: provenanceId, // Parent is the VLM_DESCRIPTION we just created
|
|
748
|
+
root_document_id: imageProv.root_document_id,
|
|
749
|
+
location: {
|
|
750
|
+
page_number: image.page_number,
|
|
751
|
+
chunk_index: image.image_index,
|
|
752
|
+
},
|
|
753
|
+
content_hash: record.content_hash, // Same content as VLM description
|
|
754
|
+
input_hash: record.content_hash,
|
|
755
|
+
file_hash: imageProv.file_hash,
|
|
756
|
+
processor: 'vlm-dedup-embedding-link',
|
|
757
|
+
processor_version: '1.0.0',
|
|
758
|
+
processing_params: {
|
|
759
|
+
source_image_id: source.id,
|
|
760
|
+
source_embedding_id: source.vlm_embedding_id,
|
|
761
|
+
dedup_reason: 'content_hash_match',
|
|
762
|
+
},
|
|
763
|
+
processing_duration_ms: 0,
|
|
764
|
+
processing_quality_score: null,
|
|
765
|
+
parent_id: provenanceId,
|
|
766
|
+
parent_ids: JSON.stringify(embParentIds),
|
|
767
|
+
chain_depth: 4, // EMBEDDING from VLM_DESCRIPTION is depth 4
|
|
768
|
+
chain_path: JSON.stringify([
|
|
769
|
+
'DOCUMENT',
|
|
770
|
+
'OCR_RESULT',
|
|
771
|
+
'IMAGE',
|
|
772
|
+
'VLM_DESCRIPTION',
|
|
773
|
+
'EMBEDDING',
|
|
774
|
+
]),
|
|
775
|
+
};
|
|
776
|
+
this.dbService.insertProvenance(embRecord);
|
|
777
|
+
console.error(`[VLMPipeline] Created dedup EMBEDDING provenance: ${embProvId} (source embedding: ${source.vlm_embedding_id})`);
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
/**
|
|
781
|
+
* Get processing statistics.
|
|
782
|
+
*/
|
|
783
|
+
getStats() {
|
|
784
|
+
return {
|
|
785
|
+
images: getImageStats(this.db),
|
|
786
|
+
vlm: this.vlm.getStatus(),
|
|
787
|
+
};
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
/**
 * Factory that builds a VLMPipeline wired to the supplied services.
 *
 * The pipeline is constructed with the connection returned by
 * `dbService.getConnection()` as its first argument and an options object
 * carrying `config`, `dbService` and `vectorService` as its second.
 *
 * @param {object} dbService - Database service; must expose `getConnection()`.
 * @param {object} vectorService - Vector service, forwarded unchanged.
 * @param {object} config - Pipeline configuration, forwarded unchanged.
 * @returns {VLMPipeline} The newly constructed pipeline instance.
 */
export function createVLMPipeline(dbService, vectorService, config) {
  const connection = dbService.getConnection();
  const options = { config, dbService, vectorService };
  return new VLMPipeline(connection, options);
}
|
|
800
|
+
//# sourceMappingURL=pipeline.js.map
|