ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,1659 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ingestion MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Extracted from src/index.ts (Task 20).
|
|
5
|
+
* Tools: ocr_ingest_directory, ocr_ingest_files, ocr_process_pending, ocr_status
|
|
6
|
+
*
|
|
7
|
+
* CRITICAL: NEVER use console.log() - stdout is reserved for the JSON-RPC protocol.
|
|
8
|
+
* Use console.error() for all logging.
|
|
9
|
+
*
|
|
10
|
+
* @module tools/ingestion
|
|
11
|
+
*/
|
|
12
|
+
import { z } from 'zod';
|
|
13
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
14
|
+
import { existsSync, statSync, lstatSync, readdirSync, mkdirSync, writeFileSync } from 'fs';
|
|
15
|
+
import { resolve, extname, basename } from 'path';
|
|
16
|
+
import { OCRProcessor } from '../services/ocr/processor.js';
|
|
17
|
+
import { DatalabClient } from '../services/ocr/datalab.js';
|
|
18
|
+
import { chunkHybridSectionAware, DEFAULT_CHUNKING_CONFIG, } from '../services/chunking/chunker.js';
|
|
19
|
+
import { extractPageOffsetsFromText } from '../services/chunking/markdown-parser.js';
|
|
20
|
+
import { EmbeddingService } from '../services/embedding/embedder.js';
|
|
21
|
+
import { ProvenanceTracker } from '../services/provenance/tracker.js';
|
|
22
|
+
import { computeHash, hashFile, computeFileHashSync } from '../utils/hash.js';
|
|
23
|
+
import { state, requireDatabase, getConfig, withDatabaseOperation } from '../server/state.js';
|
|
24
|
+
import { successResult } from '../server/types.js';
|
|
25
|
+
import { validateInput, sanitizePath, IngestDirectoryInput, IngestFilesInput, ProcessPendingInput, OCRStatusInput, RetryFailedInput, DEFAULT_FILE_TYPES, } from '../utils/validation.js';
|
|
26
|
+
import { pathNotFoundError, pathNotDirectoryError, documentNotFoundError, } from '../server/errors.js';
|
|
27
|
+
import { formatResponse, handleError } from './shared.js';
|
|
28
|
+
import { ProvenanceType } from '../models/provenance.js';
|
|
29
|
+
import { insertImageBatch, updateImageProvenance, } from '../services/storage/database/image-operations.js';
|
|
30
|
+
import { getProvenanceTracker } from '../services/provenance/index.js';
|
|
31
|
+
import { createVLMPipeline } from '../services/vlm/pipeline.js';
|
|
32
|
+
import { ImageExtractor } from '../services/images/extractor.js';
|
|
33
|
+
import { computeBlockTypeStats, detectRepeatedHeadersFooters, isRepeatedHeaderFooter } from '../services/chunking/json-block-analyzer.js';
|
|
34
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
35
|
+
// HELPER FUNCTIONS
|
|
36
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
37
|
+
/**
 * Persist chunker output for a single document.
 *
 * For each chunk result, in order, a CHUNK provenance record is created
 * (source: the OCR result, root: the document) and a chunk row is written
 * through db.insertChunk(). The returned objects mirror the inserted rows and
 * additionally carry created_at / embedding_status / embedded_at, so callers
 * can embed them without reading the rows back from the database.
 *
 * @param db - Database service; must expose insertChunk() and is handed to ProvenanceTracker
 * @param doc - Document record (id, provenance_id and file_hash are read)
 * @param ocrResult - OCR result record (id, provenance_id, content_hash, parse_quality_score are read)
 * @param chunkResults - Chunker output, one entry per chunk
 * @param config - Chunking settings recorded in the provenance params
 * @returns Array of stored chunk objects, in chunk_index order
 */
function storeChunks(db, doc, ocrResult, chunkResults, config = DEFAULT_CHUNKING_CONFIG) {
    const tracker = new ProvenanceTracker(db);
    const createdAt = new Date().toISOString();
    const stored = [];
    for (const [index, result] of chunkResults.entries()) {
        const id = uuidv4();
        const hash = computeHash(result.text);
        const headingContext = result.headingContext ?? null;
        const sectionPath = result.sectionPath ?? null;
        // Table details are only recorded for chunks that carry table metadata.
        const table = result.tableMetadata;
        const tableParams = {};
        if (table) {
            tableParams.table_columns = table.columnHeaders;
            tableParams.table_row_count = table.rowCount;
            tableParams.table_column_count = table.columnCount;
            if (table.summary) {
                tableParams.table_summary = table.summary;
            }
            if (table.caption) {
                tableParams.table_caption = table.caption;
            }
            if (table.continuationOf !== undefined) {
                tableParams.table_continuation_of = table.continuationOf;
            }
        }
        const provenanceId = tracker.createProvenance({
            type: ProvenanceType.CHUNK,
            source_type: 'CHUNKING',
            source_id: ocrResult.provenance_id,
            root_document_id: doc.provenance_id,
            content_hash: hash,
            input_hash: ocrResult.content_hash,
            file_hash: doc.file_hash,
            processor: 'chunker',
            processor_version: '2.0.0',
            processing_params: {
                strategy: 'hybrid_section',
                max_chunk_size: config.maxChunkSize,
                chunk_size: config.chunkSize,
                overlap_percent: config.overlapPercent,
                chunk_index: index,
                total_chunks: chunkResults.length,
                character_start: result.startOffset,
                character_end: result.endOffset,
                heading_context: headingContext,
                section_path: sectionPath,
                is_atomic: result.isAtomic,
                content_types: result.contentTypes,
                ...tableParams,
            },
            location: {
                chunk_index: index,
                character_start: result.startOffset,
                character_end: result.endOffset,
                page_number: result.pageNumber ?? undefined,
                page_range: result.pageRange ?? undefined,
            },
        });
        // Fields shared by the inserted row and the returned object, split in two
        // groups so both objects keep their original key order.
        const core = {
            id,
            document_id: doc.id,
            ocr_result_id: ocrResult.id,
            text: result.text,
            text_hash: hash,
            chunk_index: index,
            character_start: result.startOffset,
            character_end: result.endOffset,
            page_number: result.pageNumber,
            page_range: result.pageRange,
            overlap_previous: result.overlapWithPrevious,
            overlap_next: result.overlapWithNext,
            provenance_id: provenanceId,
        };
        const details = {
            ocr_quality_score: ocrResult.parse_quality_score ?? null,
            heading_context: headingContext,
            heading_level: result.headingLevel ?? null,
            section_path: sectionPath,
            content_types: JSON.stringify(result.contentTypes),
            is_atomic: result.isAtomic ? 1 : 0,
            chunking_strategy: 'hybrid_section',
        };
        db.insertChunk({ ...core, ...details });
        // Build the Chunk object from the insert data instead of re-fetching it.
        stored.push({
            ...core,
            created_at: createdAt,
            embedding_status: 'pending',
            embedded_at: null,
            ...details,
        });
    }
    return stored;
}
|
|
143
|
+
/**
 * Extract a context text window from OCR text for a target page.
 *
 * When pageOffsets contains an entry for the target page, the window is the
 * first 1000 characters of that page's exact character span. Otherwise the
 * page position is estimated linearly (first page -> start of text, last page
 * -> end of text) and a +/-500 character window around it is returned, with
 * partial words removed at whichever edge was cut inside the text.
 *
 * @param ocrText - Full OCR extracted text
 * @param pageCount - Total number of pages in the document
 * @param targetPage - The page number to extract context for (1-indexed)
 * @param pageOffsets - Optional exact page offset data from OCR ({page, charStart, charEnd})
 * @returns Trimmed context text window (max ~1000 chars); '' when ocrText is
 *          empty or pageCount <= 0
 */
function extractContextText(ocrText, pageCount, targetPage, pageOffsets) {
    if (!ocrText || ocrText.length === 0 || pageCount <= 0) {
        return '';
    }
    const textLength = ocrText.length;
    // Use exact page boundaries when available
    if (pageOffsets && pageOffsets.length > 0) {
        const pageInfo = pageOffsets.find((p) => p.page === targetPage);
        if (pageInfo) {
            const start = Math.max(0, Math.min(pageInfo.charStart, textLength));
            const end = Math.min(pageInfo.charEnd, textLength);
            // Cap at 1000 chars to match original behavior
            return ocrText.slice(start, Math.min(end, start + 1000)).trim();
        }
    }
    // Fallback: heuristic estimation
    const safePageCount = Math.max(1, pageCount);
    const safePage = Math.max(1, Math.min(targetPage, safePageCount));
    // Use (safePageCount - 1) as denominator so last page maps to end of text
    const estimatedPosition = Math.floor(((safePage - 1) / Math.max(1, safePageCount - 1)) * textLength);
    // Take a +/-500 char window around the estimate
    const windowStart = Math.max(0, estimatedPosition - 500);
    const windowEnd = Math.min(textLength, estimatedPosition + 500);
    let context = ocrText.slice(windowStart, windowEnd);
    // Trim to word boundaries. Any whitespace counts as a boundary, not only the
    // space character: OCR/markdown text is largely newline-separated, and
    // looking for ' ' alone leaves word fragments (or drops whole extra words)
    // at the edges of the window.
    if (windowStart > 0) {
        const firstBreak = context.search(/\s/);
        if (firstBreak > 0 && firstBreak < 50) {
            context = context.slice(firstBreak + 1);
        }
    }
    if (windowEnd < textLength) {
        // Index of the last whitespace character in the window (-1 if none).
        const lastBreak = context.search(/\s\S*$/);
        if (lastBreak > 0 && lastBreak > context.length - 50) {
            context = context.slice(0, lastBreak);
        }
    }
    return context.trim();
}
|
|
195
|
+
/**
 * Parse Datalab block type from image filename.
 * Datalab names images like: _page_0_Picture_21.jpeg, _page_0_Figure_3.jpeg
 *
 * @param filename - Image filename as returned by Datalab
 * @returns The block type segment (e.g. 'Picture', 'Figure'), or null when the
 *          name does not follow the `_page_<n>_<Type>_<n>.` pattern or is not
 *          a string at all
 */
export function parseBlockTypeFromFilename(filename) {
    // Exported helper: answer "no block type" for non-string input instead of
    // throwing a TypeError from String.prototype.match.
    if (typeof filename !== 'string') {
        return null;
    }
    const match = /_page_\d+_([A-Za-z]+)_\d+\./.exec(filename);
    return match ? match[1] : null;
}
|
|
204
|
+
/**
 * From Datalab JSON block hierarchy, classify each page's image regions.
 *
 * The top-level `children` (or, failing that, `blocks`) array is treated as
 * the list of pages: every entry whose block_type is 'Page' or missing gets
 * the next page number (starting at 1); entries with any other block_type are
 * ignored. Each page's descendants are walked recursively:
 *   - 'Figure' / 'FigureGroup' blocks set hasFigure and bump figureCount
 *   - 'Picture' / 'PictureGroup' blocks set hasPicture and are counted as
 *     header/footer pictures when they sit under a 'PageHeader'/'PageFooter'
 *     block, otherwise as body pictures
 *
 * The input comes from an external service, so null/undefined input and
 * null or non-object entries in any children array are skipped rather than
 * raising a TypeError.
 *
 * @param jsonBlocks - Datalab JSON block tree (object with `children` or `blocks`)
 * @returns Map: pageNumber -> { hasFigure, hasPicture, pictureInHeaderFooter,
 *          pictureInBody, figureCount }; empty when no page array is present
 */
export function buildPageBlockClassification(jsonBlocks) {
    const pageMap = new Map();
    const topChildren = jsonBlocks?.children ?? jsonBlocks?.blocks ?? [];
    if (!Array.isArray(topChildren)) {
        console.error('[WARN] JSON blocks has no children/blocks array');
        return pageMap;
    }
    const isBlock = (value) => value !== null && typeof value === 'object';
    // Recursively tally image blocks below one page into `classification`.
    const walkChildren = (children, inHeaderFooter, classification) => {
        if (!Array.isArray(children)) {
            return;
        }
        for (const child of children) {
            if (!isBlock(child)) {
                continue;
            }
            const btype = child.block_type;
            // Header/footer status is inherited by everything nested below.
            const isHF = inHeaderFooter || btype === 'PageHeader' || btype === 'PageFooter';
            if (btype === 'Figure' || btype === 'FigureGroup') {
                classification.hasFigure = true;
                classification.figureCount++;
            }
            if (btype === 'Picture' || btype === 'PictureGroup') {
                classification.hasPicture = true;
                if (isHF) {
                    classification.pictureInHeaderFooter++;
                }
                else {
                    classification.pictureInBody++;
                }
            }
            if (child.children) {
                walkChildren(child.children, isHF, classification);
            }
        }
    };
    let pageNum = 0;
    for (const block of topChildren) {
        if (!isBlock(block)) {
            continue;
        }
        // Only 'Page' blocks (or untyped blocks) count as pages.
        if (block.block_type && block.block_type !== 'Page') {
            continue;
        }
        pageNum++;
        const classification = {
            hasFigure: false,
            hasPicture: false,
            pictureInHeaderFooter: 0,
            pictureInBody: 0,
            figureCount: 0,
        };
        walkChildren(block.children ?? [], false, classification);
        pageMap.set(pageNum, classification);
    }
    return pageMap;
}
|
|
267
|
+
/**
 * Save images from Datalab to disk and store references in database.
 *
 * Images come from Datalab as {filename: base64_data}.
 * This function:
 *   1. Creates the output directory if it does not exist
 *   2. Decodes and writes each image to disk (entries are deleted from
 *      `images` as they are consumed, to reduce peak memory)
 *   3. Batch-inserts image records for VLM processing
 *   4. Creates an IMAGE provenance record per inserted image and links it
 *
 * Filenames originate from a remote API response, so only their final path
 * component is used when writing: a name such as "../../x.png" can never
 * escape outputDir.
 *
 * @param db - Database service (getConnection() is used for image operations)
 * @param doc - Document record
 * @param ocrResult - OCR result for provenance chain
 * @param images - Images from Datalab: {filename: base64}; emptied by this call
 * @param outputDir - Directory to save images
 * @param jsonBlocks - Optional Datalab JSON block tree for header/footer detection
 * @param pageOffsets - Optional exact page offsets used for context text
 * @returns Array of stored image records ([] when there were no images)
 * @throws Error when a filename has no usable final component; also rethrows
 *         any failure while creating provenance records
 */
function saveAndStoreImages(db, doc, ocrResult, images, outputDir, jsonBlocks, pageOffsets) {
    // Create output directory
    if (!existsSync(outputDir)) {
        mkdirSync(outputDir, { recursive: true });
    }
    // Build page-level image classification from JSON blocks
    const pageClassification = jsonBlocks
        ? buildPageBlockClassification(jsonBlocks)
        : new Map();
    const imageRefs = [];
    const pageImageCounts = new Map();
    for (const filename of Object.keys(images)) {
        // SECURITY: never let an externally supplied name carry directory parts.
        const safeName = basename(filename);
        if (safeName === '' || safeName === '.' || safeName === '..') {
            throw new Error(`Refusing to save image with unusable filename: ${JSON.stringify(filename)}`);
        }
        const buffer = Buffer.from(images[filename], 'base64');
        // Release base64 string immediately to reduce peak memory
        delete images[filename];
        const filePath = resolve(outputDir, safeName);
        writeFileSync(filePath, buffer);
        // Parse page number from filename (e.g., "page_1_image_0.png" or "p001_i000.png").
        // The number in the name is incremented by one before use; names without
        // a page marker are assigned to page 1.
        const pageMatch = safeName.match(/page_(\d+)|p(\d+)/i);
        const pageNumber = pageMatch ? parseInt(pageMatch[1] || pageMatch[2], 10) + 1 : 1;
        // Per-page image index
        const imageIndex = pageImageCounts.get(pageNumber) ?? 0;
        pageImageCounts.set(pageNumber, imageIndex + 1);
        const contentHash = computeHash(buffer);
        // Parse block type from Datalab filename
        const blockType = parseBlockTypeFromFilename(safeName);
        // Determine if image is in header/footer region: either the block type
        // says so, or the page has pictures only inside headers/footers and no figures.
        const pageInfo = pageClassification.get(pageNumber);
        const isHeaderFooter = blockType === 'PageHeader' ||
            blockType === 'PageFooter' ||
            (pageInfo !== undefined &&
                !pageInfo.hasFigure &&
                pageInfo.pictureInHeaderFooter > 0 &&
                pageInfo.pictureInBody === 0);
        // Get image format from extension
        const format = extname(safeName).slice(1).toLowerCase() || 'png';
        // Extract context text from OCR for this page (uses exact pageOffsets when available)
        const contextText = extractContextText(ocrResult.extracted_text, ocrResult.page_count ?? 1, pageNumber, pageOffsets);
        // Create image reference for database
        imageRefs.push({
            document_id: doc.id,
            ocr_result_id: ocrResult.id,
            page_number: pageNumber,
            bounding_box: { x: 0, y: 0, width: 0, height: 0 }, // Datalab doesn't provide bbox
            image_index: imageIndex,
            format,
            dimensions: { width: 0, height: 0 }, // Datalab does not provide dimensions; filtering pipeline bypasses dimension check when both are 0
            extracted_path: filePath,
            file_size: buffer.length,
            context_text: contextText || null,
            provenance_id: null, // Will be set after insert with provenance record
            block_type: blockType,
            is_header_footer: isHeaderFooter,
            content_hash: contentHash,
        });
    }
    if (imageRefs.length === 0) {
        return [];
    }
    // Batch insert all images
    const insertedImages = insertImageBatch(db.getConnection(), imageRefs);
    // Create IMAGE provenance records and update image records
    const tracker = getProvenanceTracker(db);
    for (const img of insertedImages) {
        try {
            const provenanceId = tracker.createProvenance({
                type: ProvenanceType.IMAGE,
                source_type: 'IMAGE_EXTRACTION',
                source_id: ocrResult.provenance_id,
                root_document_id: doc.provenance_id,
                // Prefer the stored hash; otherwise hash the file on disk, and as a
                // last resort hash the image id.
                content_hash: img.content_hash ??
                    (img.extracted_path && existsSync(img.extracted_path)
                        ? computeFileHashSync(img.extracted_path)
                        : computeHash(img.id)),
                source_path: img.extracted_path ?? undefined,
                processor: 'datalab-image-extraction',
                processor_version: '1.0.0',
                processing_params: {
                    page_number: img.page_number,
                    image_index: img.image_index,
                    format: img.format,
                    block_type: img.block_type,
                    is_header_footer: img.is_header_footer,
                },
                location: {
                    page_number: img.page_number,
                },
            });
            // Update the image record with the provenance ID
            updateImageProvenance(db.getConnection(), img.id, provenanceId);
            img.provenance_id = provenanceId;
        }
        catch (error) {
            // Logged for context, then rethrown: a missing provenance record fails the call.
            console.error(`[WARN] Failed to create IMAGE provenance for ${img.id}: ${error instanceof Error ? error.message : String(error)}`);
            throw error;
        }
    }
    return insertedImages;
}
|
|
385
|
+
/**
|
|
386
|
+
* Process a single document through the full OCR pipeline.
|
|
387
|
+
*
|
|
388
|
+
* Pipeline: OCR -> Extract Images -> Chunk -> Embed -> VLM -> Structured Extraction -> Complete
|
|
389
|
+
*
|
|
390
|
+
* This function is the core processing unit used by both handleProcessPending (batch)
|
|
391
|
+
* and handleReprocess (single document). Extracting it prevents the race condition
|
|
392
|
+
* where handleReprocess calls handleProcessPending and the target document may not
|
|
393
|
+
* be claimed when other pending documents exist (M-11).
|
|
394
|
+
*
|
|
395
|
+
* @param doc - Document record (must already have status='processing')
|
|
396
|
+
* @param params - Processing parameters
|
|
397
|
+
* @returns void on success, throws on failure
|
|
398
|
+
*/
|
|
399
|
+
async function processOneDocument(doc, params) {
|
|
400
|
+
const warnings = [];
|
|
401
|
+
const { db, vector, ocrMode, ocrOptions, pageSchema, imagesBaseDir } = params;
|
|
402
|
+
console.error(`[INFO] Processing document: ${doc.id} (${doc.file_name})`);
|
|
403
|
+
// Step 1: OCR via Datalab
|
|
404
|
+
// OCRProcessor.processDocument() throws on failure (FAIL-FAST).
|
|
405
|
+
// It handles status='processing' internally and marks 'failed' before throwing.
|
|
406
|
+
const ocrProcessor = new OCRProcessor(db);
|
|
407
|
+
const processResult = await ocrProcessor.processDocument(doc.id, ocrMode, ocrOptions);
|
|
408
|
+
// Get the OCR result
|
|
409
|
+
const ocrResult = db.getOCRResultByDocumentId(doc.id);
|
|
410
|
+
if (!ocrResult) {
|
|
411
|
+
throw new Error('OCR result not found after processing');
|
|
412
|
+
}
|
|
413
|
+
console.error(`[INFO] OCR complete: ${ocrResult.text_length} chars, ${ocrResult.page_count} pages`);
|
|
414
|
+
// Step 1.5: Extract and store images from OCR result (if any)
|
|
415
|
+
let imageCount = 0;
|
|
416
|
+
const imageOutputDir = resolve(imagesBaseDir, doc.id);
|
|
417
|
+
if (processResult.images && Object.keys(processResult.images).length > 0) {
|
|
418
|
+
const imageRefs = saveAndStoreImages(db, doc, ocrResult, processResult.images, imageOutputDir, processResult.jsonBlocks, processResult.pageOffsets);
|
|
419
|
+
imageCount = imageRefs.length;
|
|
420
|
+
console.error(`[INFO] Images from Datalab: ${imageCount}`);
|
|
421
|
+
}
|
|
422
|
+
// Step 1.6: File-based image extraction fallback
|
|
423
|
+
// If Datalab didn't return images, extract directly from file (PDF or DOCX)
|
|
424
|
+
if (imageCount === 0 &&
|
|
425
|
+
!ocrOptions.disableImageExtraction &&
|
|
426
|
+
ImageExtractor.isSupported(doc.file_path)) {
|
|
427
|
+
console.error(`[INFO] No images from Datalab for ${doc.file_type} file, running file-based extraction`);
|
|
428
|
+
const extractor = new ImageExtractor();
|
|
429
|
+
const extractedImages = await extractor.extractImages(doc.file_path, {
|
|
430
|
+
outputDir: imageOutputDir,
|
|
431
|
+
minSize: 50,
|
|
432
|
+
maxImages: 500,
|
|
433
|
+
});
|
|
434
|
+
if (extractedImages.length > 0) {
|
|
435
|
+
// Build page classification from JSON blocks for header/footer detection
|
|
436
|
+
const pageClassification = processResult.jsonBlocks
|
|
437
|
+
? buildPageBlockClassification(processResult.jsonBlocks)
|
|
438
|
+
: new Map();
|
|
439
|
+
const imageRefs = extractedImages.map((img) => {
|
|
440
|
+
const contentHash = computeFileHashSync(img.path);
|
|
441
|
+
const pageInfo = pageClassification.get(img.page);
|
|
442
|
+
const isHeaderFooter = pageInfo !== undefined &&
|
|
443
|
+
!pageInfo.hasFigure &&
|
|
444
|
+
pageInfo.pictureInHeaderFooter > 0 &&
|
|
445
|
+
pageInfo.pictureInBody === 0;
|
|
446
|
+
const contextText = extractContextText(ocrResult.extracted_text, ocrResult.page_count ?? 1, img.page);
|
|
447
|
+
return {
|
|
448
|
+
document_id: doc.id,
|
|
449
|
+
ocr_result_id: ocrResult.id,
|
|
450
|
+
page_number: img.page,
|
|
451
|
+
bounding_box: img.bbox,
|
|
452
|
+
image_index: img.index,
|
|
453
|
+
format: img.format,
|
|
454
|
+
dimensions: { width: img.width, height: img.height },
|
|
455
|
+
extracted_path: img.path,
|
|
456
|
+
file_size: img.size,
|
|
457
|
+
context_text: contextText || null,
|
|
458
|
+
provenance_id: null,
|
|
459
|
+
block_type: null, // File-based extraction has no block type
|
|
460
|
+
is_header_footer: isHeaderFooter,
|
|
461
|
+
content_hash: contentHash,
|
|
462
|
+
};
|
|
463
|
+
});
|
|
464
|
+
const insertedImages = insertImageBatch(db.getConnection(), imageRefs);
|
|
465
|
+
// Create IMAGE provenance records
|
|
466
|
+
const tracker = getProvenanceTracker(db);
|
|
467
|
+
for (const img of insertedImages) {
|
|
468
|
+
try {
|
|
469
|
+
const provenanceId = tracker.createProvenance({
|
|
470
|
+
type: ProvenanceType.IMAGE,
|
|
471
|
+
source_type: 'IMAGE_EXTRACTION',
|
|
472
|
+
source_id: ocrResult.provenance_id,
|
|
473
|
+
root_document_id: doc.provenance_id,
|
|
474
|
+
content_hash: img.content_hash ??
|
|
475
|
+
(img.extracted_path && existsSync(img.extracted_path)
|
|
476
|
+
? computeFileHashSync(img.extracted_path)
|
|
477
|
+
: computeHash(img.id)),
|
|
478
|
+
source_path: img.extracted_path ?? undefined,
|
|
479
|
+
processor: `${doc.file_type}-image-extraction`,
|
|
480
|
+
processor_version: '1.0.0',
|
|
481
|
+
processing_params: {
|
|
482
|
+
page_number: img.page_number,
|
|
483
|
+
image_index: img.image_index,
|
|
484
|
+
format: img.format,
|
|
485
|
+
extraction_method: 'file-based',
|
|
486
|
+
is_header_footer: img.is_header_footer,
|
|
487
|
+
},
|
|
488
|
+
location: {
|
|
489
|
+
page_number: img.page_number,
|
|
490
|
+
},
|
|
491
|
+
});
|
|
492
|
+
updateImageProvenance(db.getConnection(), img.id, provenanceId);
|
|
493
|
+
}
|
|
494
|
+
catch (provError) {
|
|
495
|
+
console.error(`[ERROR] Failed to create IMAGE provenance for ${img.id}: ` +
|
|
496
|
+
`${provError instanceof Error ? provError.message : String(provError)}`);
|
|
497
|
+
throw provError;
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
imageCount = insertedImages.length;
|
|
501
|
+
console.error(`[INFO] File-based extraction: ${imageCount} images`);
|
|
502
|
+
}
|
|
503
|
+
else {
|
|
504
|
+
console.error(`[INFO] File-based extraction: no images found in document`);
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
// Step 2: Chunk the OCR text using hybrid section-aware chunker
|
|
508
|
+
const chunkConfig = {
|
|
509
|
+
chunkSize: state.config.chunkSize,
|
|
510
|
+
overlapPercent: state.config.chunkOverlapPercent,
|
|
511
|
+
maxChunkSize: state.config.maxChunkSize,
|
|
512
|
+
};
|
|
513
|
+
let pageOffsets = processResult.pageOffsets ?? [];
|
|
514
|
+
// Fallback: if Python returned a single page offset covering the entire text,
|
|
515
|
+
// re-extract using TypeScript's extractPageOffsetsFromText which handles both
|
|
516
|
+
// HTML comment (<!-- Page N -->) and Datalab ({N}---) separator formats.
|
|
517
|
+
if (pageOffsets.length <= 1 && ocrResult.extracted_text.length > 0) {
|
|
518
|
+
const extracted = extractPageOffsetsFromText(ocrResult.extracted_text);
|
|
519
|
+
if (extracted.length > pageOffsets.length) {
|
|
520
|
+
pageOffsets = extracted;
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
const chunkResults = chunkHybridSectionAware(ocrResult.extracted_text, pageOffsets, processResult.jsonBlocks ?? null, chunkConfig);
|
|
524
|
+
console.error(`[INFO] Chunking complete: ${chunkResults.length} chunks`);
|
|
525
|
+
// Step 3: Store chunks in database with provenance
|
|
526
|
+
const chunks = storeChunks(db, doc, ocrResult, chunkResults, chunkConfig);
|
|
527
|
+
console.error(`[INFO] Chunks stored: ${chunks.length}`);
|
|
528
|
+
// Step 3.4: Detect repeated headers/footers and tag matching chunks (T2.8)
|
|
529
|
+
if (processResult.jsonBlocks) {
|
|
530
|
+
try {
|
|
531
|
+
const headerFooterInfo = detectRepeatedHeadersFooters(processResult.jsonBlocks);
|
|
532
|
+
const allRepeated = [...headerFooterInfo.repeatedHeaders, ...headerFooterInfo.repeatedFooters];
|
|
533
|
+
if (allRepeated.length > 0) {
|
|
534
|
+
const conn = db.getConnection();
|
|
535
|
+
let tagRow = conn.prepare("SELECT id FROM tags WHERE name = ?").get('system:repeated_header_footer');
|
|
536
|
+
if (!tagRow) {
|
|
537
|
+
const tagId = uuidv4();
|
|
538
|
+
conn.prepare("INSERT INTO tags (id, name, description, color) VALUES (?, ?, ?, ?)").run(tagId, 'system:repeated_header_footer', 'Auto-detected repeated page header or footer content', '#888888');
|
|
539
|
+
tagRow = { id: tagId };
|
|
540
|
+
}
|
|
541
|
+
let taggedCount = 0;
|
|
542
|
+
for (const chunk of chunks) {
|
|
543
|
+
if (isRepeatedHeaderFooter(chunk.text, allRepeated)) {
|
|
544
|
+
const entityTagId = uuidv4();
|
|
545
|
+
conn.prepare("INSERT OR IGNORE INTO entity_tags (id, tag_id, entity_id, entity_type) VALUES (?, ?, ?, 'chunk')").run(entityTagId, tagRow.id, chunk.id);
|
|
546
|
+
taggedCount++;
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
console.error(`[T2.8] Tagged ${taggedCount} chunks as repeated header/footer (${allRepeated.length} patterns detected) for document ${doc.id}`);
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
catch (tagError) {
|
|
553
|
+
const tagErrMsg = tagError instanceof Error ? tagError.message : String(tagError);
|
|
554
|
+
console.error(`[WARN] Header/footer tagging failed for ${doc.id}: ${tagErrMsg}`);
|
|
555
|
+
warnings.push(`Header/footer auto-tagging failed: ${tagErrMsg}. Chunks stored but repeated headers/footers not tagged.`);
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
// Step 3.5: Enrich extras_json with block stats, links, and structural fingerprint
|
|
559
|
+
// (Tasks 4.1, 4.2, 4.4 - Ingestion Pipeline Enrichment)
|
|
560
|
+
try {
|
|
561
|
+
const existingExtras = ocrResult.extras_json
|
|
562
|
+
? JSON.parse(ocrResult.extras_json)
|
|
563
|
+
: {};
|
|
564
|
+
// Task 4.1: Block-type statistics from json_blocks
|
|
565
|
+
const blockStats = computeBlockTypeStats(processResult.jsonBlocks ?? null);
|
|
566
|
+
if (blockStats) {
|
|
567
|
+
existingExtras.block_type_stats = blockStats;
|
|
568
|
+
}
|
|
569
|
+
// Task 4.2: Extract structured hyperlinks from Datalab metadata
|
|
570
|
+
const metadataObj = (existingExtras.metadata ?? processResult.metadata ?? null);
|
|
571
|
+
if (metadataObj) {
|
|
572
|
+
// Datalab stores links under metadata.extras_features.links or metadata.links
|
|
573
|
+
const extrasFeatures = metadataObj.extras_features;
|
|
574
|
+
const rawLinks = (extrasFeatures?.links ?? metadataObj.links ?? null);
|
|
575
|
+
if (Array.isArray(rawLinks) && rawLinks.length > 0) {
|
|
576
|
+
const structuredLinks = rawLinks
|
|
577
|
+
.filter((link) => {
|
|
578
|
+
const url = (link.url ?? link.href ?? '');
|
|
579
|
+
return url.length > 0;
|
|
580
|
+
})
|
|
581
|
+
.map((link) => ({
|
|
582
|
+
url: (link.url ?? link.href),
|
|
583
|
+
anchor_text: (link.anchor_text ?? link.text ?? link.title ?? ''),
|
|
584
|
+
page_number: (link.page_number ?? link.page ?? null),
|
|
585
|
+
}));
|
|
586
|
+
existingExtras.structured_links = structuredLinks;
|
|
587
|
+
existingExtras.link_count = structuredLinks.length;
|
|
588
|
+
}
|
|
589
|
+
else {
|
|
590
|
+
existingExtras.link_count = 0;
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
// Task 4.4: Structural fingerprint from chunks
|
|
594
|
+
const headingDepths = {};
|
|
595
|
+
let totalChunkSize = 0;
|
|
596
|
+
let atomicChunkCount = 0;
|
|
597
|
+
let tableCount = 0;
|
|
598
|
+
let figureCount = 0;
|
|
599
|
+
const contentTypeDist = {};
|
|
600
|
+
for (const cr of chunkResults) {
|
|
601
|
+
totalChunkSize += cr.text.length;
|
|
602
|
+
if (cr.isAtomic)
|
|
603
|
+
atomicChunkCount++;
|
|
604
|
+
// Count heading depths from heading level
|
|
605
|
+
if (cr.headingLevel !== null && cr.headingLevel !== undefined) {
|
|
606
|
+
const key = `h${cr.headingLevel}`;
|
|
607
|
+
headingDepths[key] = (headingDepths[key] ?? 0) + 1;
|
|
608
|
+
}
|
|
609
|
+
// Count content types
|
|
610
|
+
for (const ct of cr.contentTypes) {
|
|
611
|
+
contentTypeDist[ct] = (contentTypeDist[ct] ?? 0) + 1;
|
|
612
|
+
if (ct === 'Table' || ct === 'TableGroup')
|
|
613
|
+
tableCount++;
|
|
614
|
+
if (ct === 'Figure' || ct === 'FigureGroup')
|
|
615
|
+
figureCount++;
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
existingExtras.structural_fingerprint = {
|
|
619
|
+
page_count: ocrResult.page_count ?? 0,
|
|
620
|
+
chunk_count: chunkResults.length,
|
|
621
|
+
table_count: tableCount,
|
|
622
|
+
figure_count: figureCount,
|
|
623
|
+
heading_depths: headingDepths,
|
|
624
|
+
avg_chunk_size: chunkResults.length > 0
|
|
625
|
+
? Math.round(totalChunkSize / chunkResults.length)
|
|
626
|
+
: 0,
|
|
627
|
+
atomic_chunk_ratio: chunkResults.length > 0
|
|
628
|
+
? Math.round((atomicChunkCount / chunkResults.length) * 100) / 100
|
|
629
|
+
: 0,
|
|
630
|
+
content_type_distribution: contentTypeDist,
|
|
631
|
+
};
|
|
632
|
+
// Persist enriched extras_json back to ocr_results
|
|
633
|
+
const updatedExtrasJson = JSON.stringify(existingExtras);
|
|
634
|
+
db.getConnection()
|
|
635
|
+
.prepare('UPDATE ocr_results SET extras_json = ? WHERE id = ?')
|
|
636
|
+
.run(updatedExtrasJson, ocrResult.id);
|
|
637
|
+
console.error(`[INFO] Extras enriched: block_stats=${blockStats ? 'yes' : 'no'}, ` +
|
|
638
|
+
`links=${existingExtras.link_count ?? 0}, fingerprint=yes`);
|
|
639
|
+
}
|
|
640
|
+
catch (enrichError) {
|
|
641
|
+
const enrichErrMsg = enrichError instanceof Error ? enrichError.message : String(enrichError);
|
|
642
|
+
console.error(`[WARN] Extras enrichment failed for ${doc.id}: ${enrichErrMsg}`);
|
|
643
|
+
warnings.push(`Metadata enrichment failed: ${enrichErrMsg}. Document complete but block stats, links, and structural fingerprint are missing.`);
|
|
644
|
+
}
|
|
645
|
+
// Step 4: Generate embeddings for text chunks
|
|
646
|
+
const embeddingService = new EmbeddingService();
|
|
647
|
+
const documentInfo = {
|
|
648
|
+
documentId: doc.id,
|
|
649
|
+
filePath: doc.file_path,
|
|
650
|
+
fileName: doc.file_name,
|
|
651
|
+
fileHash: doc.file_hash,
|
|
652
|
+
documentProvenanceId: doc.provenance_id,
|
|
653
|
+
};
|
|
654
|
+
const embedResult = await embeddingService.embedDocumentChunks(db, vector, chunks, documentInfo);
|
|
655
|
+
if (!embedResult.success) {
|
|
656
|
+
throw new Error(embedResult.error ?? 'Embedding generation failed');
|
|
657
|
+
}
|
|
658
|
+
console.error(`[INFO] Embeddings complete: ${embedResult.embeddingIds.length} embeddings in ${embedResult.elapsedMs}ms`);
|
|
659
|
+
// Step 5: VLM process images (generate 3+ paragraph descriptions)
|
|
660
|
+
// Only run if document had images extracted.
|
|
661
|
+
// VLM failures for individual images are logged as warnings but do NOT fail
|
|
662
|
+
// the document -- OCR, chunking, and embeddings already succeeded. Each image
|
|
663
|
+
// has its own vlm_status ('complete'|'failed'|'skipped') tracked independently.
|
|
664
|
+
if (imageCount > 0) {
|
|
665
|
+
const vlmPipeline = createVLMPipeline(db, vector, {
|
|
666
|
+
batchSize: 5,
|
|
667
|
+
concurrency: 3,
|
|
668
|
+
minConfidence: 0.5,
|
|
669
|
+
});
|
|
670
|
+
const vlmResult = await vlmPipeline.processDocument(doc.id);
|
|
671
|
+
console.error(`[INFO] VLM complete: ${vlmResult.successful}/${vlmResult.total} images processed, ` +
|
|
672
|
+
`${vlmResult.skipped} skipped, ${vlmResult.failed} failed, ` +
|
|
673
|
+
`${vlmResult.totalTokens} tokens used`);
|
|
674
|
+
if (vlmResult.failed > 0) {
|
|
675
|
+
const failedDetails = vlmResult.results
|
|
676
|
+
.filter((r) => !r.success)
|
|
677
|
+
.map((r) => `${r.imageId}: ${r.error ?? 'unknown error'}`)
|
|
678
|
+
.join('; ');
|
|
679
|
+
console.error(`[WARN] VLM processing failed for ${vlmResult.failed}/${vlmResult.total} images ` +
|
|
680
|
+
`in document ${doc.id}. Individual images marked as failed; document will ` +
|
|
681
|
+
`complete normally. Details: ${failedDetails}`);
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
// Step 5.5: Store structured extraction if present
|
|
685
|
+
// Errors propagate to fail the document (no swallowing)
|
|
686
|
+
if (processResult.extractionJson && pageSchema) {
|
|
687
|
+
const extractionContent = JSON.stringify(processResult.extractionJson);
|
|
688
|
+
const extractionHash = computeHash(extractionContent);
|
|
689
|
+
// Create EXTRACTION provenance record
|
|
690
|
+
const extractionProvId = uuidv4();
|
|
691
|
+
const ocrProvId = processResult.provenanceId;
|
|
692
|
+
const docProvId = doc.provenance_id;
|
|
693
|
+
const now = new Date().toISOString();
|
|
694
|
+
db.insertProvenance({
|
|
695
|
+
id: extractionProvId,
|
|
696
|
+
type: ProvenanceType.EXTRACTION,
|
|
697
|
+
created_at: now,
|
|
698
|
+
processed_at: now,
|
|
699
|
+
source_file_created_at: null,
|
|
700
|
+
source_file_modified_at: null,
|
|
701
|
+
source_type: 'EXTRACTION',
|
|
702
|
+
source_path: doc.file_path,
|
|
703
|
+
source_id: ocrProvId,
|
|
704
|
+
root_document_id: docProvId,
|
|
705
|
+
location: null,
|
|
706
|
+
content_hash: extractionHash,
|
|
707
|
+
input_hash: ocrResult.content_hash,
|
|
708
|
+
file_hash: doc.file_hash,
|
|
709
|
+
processor: 'datalab-extraction',
|
|
710
|
+
processor_version: '1.0.0',
|
|
711
|
+
processing_params: { page_schema: pageSchema },
|
|
712
|
+
processing_duration_ms: null,
|
|
713
|
+
processing_quality_score: null,
|
|
714
|
+
parent_id: ocrProvId,
|
|
715
|
+
parent_ids: JSON.stringify([docProvId, ocrProvId]),
|
|
716
|
+
chain_depth: 2,
|
|
717
|
+
chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'EXTRACTION']),
|
|
718
|
+
});
|
|
719
|
+
db.insertExtraction({
|
|
720
|
+
id: uuidv4(),
|
|
721
|
+
document_id: doc.id,
|
|
722
|
+
ocr_result_id: ocrResult.id,
|
|
723
|
+
schema_json: pageSchema,
|
|
724
|
+
extraction_json: extractionContent,
|
|
725
|
+
content_hash: extractionHash,
|
|
726
|
+
provenance_id: extractionProvId,
|
|
727
|
+
created_at: now,
|
|
728
|
+
});
|
|
729
|
+
console.error(`[INFO] Stored structured extraction for document ${doc.id}`);
|
|
730
|
+
}
|
|
731
|
+
// Step 5.6: Update document metadata if available
|
|
732
|
+
if (processResult.docTitle || processResult.docAuthor || processResult.docSubject) {
|
|
733
|
+
db.updateDocumentMetadata(doc.id, {
|
|
734
|
+
docTitle: processResult.docTitle ?? null,
|
|
735
|
+
docAuthor: processResult.docAuthor ?? null,
|
|
736
|
+
docSubject: processResult.docSubject ?? null,
|
|
737
|
+
});
|
|
738
|
+
}
|
|
739
|
+
// Step 6: Mark document complete (OCR + chunks + embeddings succeeded)
|
|
740
|
+
// Note: Generation validation is handled by withDatabaseOperation() in the caller.
|
|
741
|
+
db.updateDocumentStatus(doc.id, 'complete');
|
|
742
|
+
console.error(`[INFO] Document ${doc.id} processing complete`);
|
|
743
|
+
return warnings;
|
|
744
|
+
}
|
|
745
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
746
|
+
// INGESTION TOOL HANDLERS
|
|
747
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
748
|
+
/**
 * Handle ocr_ingest_directory - Ingest all documents from a directory
 *
 * Scans `directory_path` (descending into subdirectories only when `recursive` is set)
 * and, for every regular file whose lower-cased extension is listed in `file_types`
 * (falling back to DEFAULT_FILE_TYPES), registers a DOCUMENT provenance record and a
 * document row with status 'pending'. Symbolic links are never followed.
 *
 * Per-file outcomes reported in `items`:
 *  - 'skipped'         same path already ingested with an identical hash, or the same
 *                      hash already exists under a different path
 *  - 'version_updated' same path already ingested but the hash changed; a new document
 *                      row is created and `previous_version_id` points at the old one
 *  - 'pending'         newly registered document
 *  - 'error'           anything thrown while handling that single file
 *
 * @param {object} params - Raw tool arguments, validated against IngestDirectoryInput.
 * @returns {Promise<*>} The formatted success response, or the result of handleError()
 *   when validation, the database lookup, or the directory scan fails.
 */
export async function handleIngestDirectory(params) {
    try {
        const input = validateInput(IngestDirectoryInput, params);
        const { db } = requireDatabase();
        const safeDirPath = sanitizePath(input.directory_path);
        // Validate directory exists - FAIL FAST
        if (!existsSync(safeDirPath)) {
            throw pathNotFoundError(safeDirPath);
        }
        const dirStats = statSync(safeDirPath);
        if (!dirStats.isDirectory()) {
            throw pathNotDirectoryError(safeDirPath);
        }
        const fileTypes = input.file_types ?? [...DEFAULT_FILE_TYPES];
        const items = [];
        // Matching files are appended to one shared accumulator. Spreading a whole
        // subtree's result into push(...) passes every path as a call argument and can
        // throw a RangeError on very large directories.
        const files = [];
        const collectFiles = (dirPath) => {
            const entries = readdirSync(dirPath, { withFileTypes: true });
            for (const entry of entries) {
                const fullPath = resolve(dirPath, entry.name);
                try {
                    if (lstatSync(fullPath).isSymbolicLink()) {
                        console.error(`[WARN] Skipping symlink during ingestion: ${fullPath}`);
                        continue;
                    }
                }
                catch (error) {
                    console.error(`[WARN] Could not stat entry, skipping: ${fullPath}:`, error instanceof Error ? error.message : String(error));
                    continue;
                }
                if (entry.isDirectory() && input.recursive) {
                    collectFiles(fullPath);
                }
                else if (entry.isFile()) {
                    const ext = extname(entry.name).slice(1).toLowerCase();
                    if (fileTypes.includes(ext)) {
                        files.push(fullPath);
                    }
                }
            }
        };
        collectFiles(safeDirPath);
        // Ingest each file; a failure on one file is recorded and does not stop the others
        for (const filePath of files) {
            try {
                // Check if already ingested by path
                const existingByPath = db.getDocumentByPath(filePath);
                const stats = statSync(filePath);
                const fileHash = await hashFile(filePath);
                if (existingByPath) {
                    if (fileHash === existingByPath.file_hash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByPath.id,
                            status: 'skipped',
                            error_message: 'Already ingested, content unchanged',
                        });
                        continue;
                    }
                    // Version change detected - continue with normal ingestion flow below
                    console.error(`[Ingestion] Version update detected for ${filePath}: ${existingByPath.file_hash} -> ${fileHash}`);
                }
                else {
                    // Check for duplicate by file hash (same content, different path)
                    const existingByHash = db.getDocumentByHash(fileHash);
                    if (existingByHash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByHash.id,
                            status: 'skipped',
                            error_message: `Duplicate file (same hash as ${existingByHash.file_path})`,
                        });
                        continue;
                    }
                }
                // Determine if this is a version update
                const isVersionUpdate = !!existingByPath;
                // Create document record
                const documentId = uuidv4();
                const provenanceId = uuidv4();
                const now = new Date().toISOString();
                const ext = extname(filePath).slice(1).toLowerCase();
                // Create document provenance (chain root: it is its own root_document_id)
                db.insertProvenance({
                    id: provenanceId,
                    type: ProvenanceType.DOCUMENT,
                    created_at: now,
                    processed_at: now,
                    source_file_created_at: null,
                    source_file_modified_at: null,
                    source_type: 'FILE',
                    source_path: filePath,
                    source_id: null,
                    root_document_id: provenanceId,
                    location: null,
                    content_hash: fileHash,
                    input_hash: null,
                    file_hash: fileHash,
                    processor: 'file-scanner',
                    processor_version: '1.0.0',
                    processing_params: {
                        directory_path: safeDirPath,
                        recursive: input.recursive,
                        ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                    },
                    processing_duration_ms: null,
                    processing_quality_score: null,
                    parent_id: null,
                    parent_ids: '[]',
                    chain_depth: 0,
                    chain_path: '["DOCUMENT"]',
                });
                // Insert document
                db.insertDocument({
                    id: documentId,
                    file_path: filePath,
                    file_name: basename(filePath),
                    file_hash: fileHash,
                    file_size: stats.size,
                    file_type: ext,
                    status: 'pending',
                    page_count: null,
                    provenance_id: provenanceId,
                    error_message: null,
                    modified_at: null,
                    ocr_completed_at: null,
                    doc_title: null,
                    doc_author: null,
                    doc_subject: null,
                    datalab_file_id: null,
                });
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: documentId,
                    status: isVersionUpdate ? 'version_updated' : 'pending',
                    ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                });
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Failed to ingest ${filePath}: ${errorMsg}`);
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: '',
                    status: 'error',
                    error_message: errorMsg,
                });
            }
        }
        const result = {
            directory_path: safeDirPath,
            files_found: files.length,
            files_ingested: items.filter((i) => i.status === 'pending').length,
            files_version_updated: items.filter((i) => i.status === 'version_updated').length,
            files_skipped: items.filter((i) => i.status === 'skipped').length,
            files_errored: items.filter((i) => i.status === 'error').length,
            items,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Run OCR pipeline on the ingested files' },
            ],
        };
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
923
|
+
/**
 * Handle ocr_ingest_files - Ingest specific files
 *
 * Each entry of `file_paths` is handled independently: it is sanitized, checked to be an
 * existing regular file with an extension listed in DEFAULT_FILE_TYPES, hashed, and then
 * either skipped (already ingested with the same hash, or same hash under another path)
 * or registered as a DOCUMENT provenance record plus a document row with status
 * 'pending'. A path that was already ingested with a different hash produces a new
 * document row reported as 'version_updated' with `previous_version_id` set.
 *
 * Any failure for a single path, including rejection by sanitizePath(), is recorded as
 * an 'error' item so the remaining paths are still processed and the IDs of documents
 * that were already inserted are still returned to the caller.
 *
 * @param {object} params - Raw tool arguments, validated against IngestFilesInput.
 * @returns {Promise<*>} The formatted success response, or the result of handleError()
 *   when validation or the database lookup fails.
 */
export async function handleIngestFiles(params) {
    try {
        const input = validateInput(IngestFilesInput, params);
        const { db } = requireDatabase();
        const items = [];
        for (const rawFilePath of input.file_paths) {
            // Holds the raw path until sanitization succeeds, so the catch block below
            // can still report which input failed.
            let filePath = rawFilePath;
            try {
                filePath = sanitizePath(rawFilePath);
                // Validate file exists - FAIL FAST
                if (!existsSync(filePath)) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: 'File not found',
                    });
                    continue;
                }
                const stats = statSync(filePath);
                if (!stats.isFile()) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: 'Path is not a file',
                    });
                    continue;
                }
                // Validate file type is supported before touching the database or hashing
                const ext = extname(filePath).slice(1).toLowerCase();
                if (!DEFAULT_FILE_TYPES.includes(ext)) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: `Unsupported file type: .${ext}. Supported: ${DEFAULT_FILE_TYPES.join(', ')}`,
                    });
                    continue;
                }
                // Check if already ingested
                const existingByPath = db.getDocumentByPath(filePath);
                const fileHash = await hashFile(filePath);
                if (existingByPath) {
                    if (fileHash === existingByPath.file_hash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByPath.id,
                            status: 'skipped',
                            error_message: 'Already ingested, content unchanged',
                        });
                        continue;
                    }
                    // Version change detected - continue with normal ingestion flow below
                    console.error(`[Ingestion] Version update detected for ${filePath}: ${existingByPath.file_hash} -> ${fileHash}`);
                }
                else {
                    // Check for duplicate by file hash (same content, different path)
                    const existingByHash = db.getDocumentByHash(fileHash);
                    if (existingByHash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByHash.id,
                            status: 'skipped',
                            error_message: `Duplicate file (same hash as ${existingByHash.file_path})`,
                        });
                        continue;
                    }
                }
                // Determine if this is a version update
                const isVersionUpdate = !!existingByPath;
                // Create document record
                const documentId = uuidv4();
                const provenanceId = uuidv4();
                const now = new Date().toISOString();
                // Create document provenance (chain root: it is its own root_document_id)
                db.insertProvenance({
                    id: provenanceId,
                    type: ProvenanceType.DOCUMENT,
                    created_at: now,
                    processed_at: now,
                    source_file_created_at: null,
                    source_file_modified_at: null,
                    source_type: 'FILE',
                    source_path: filePath,
                    source_id: null,
                    root_document_id: provenanceId,
                    location: null,
                    content_hash: fileHash,
                    input_hash: null,
                    file_hash: fileHash,
                    processor: 'file-scanner',
                    processor_version: '1.0.0',
                    processing_params: isVersionUpdate ? { previous_version_id: existingByPath.id } : {},
                    processing_duration_ms: null,
                    processing_quality_score: null,
                    parent_id: null,
                    parent_ids: '[]',
                    chain_depth: 0,
                    chain_path: '["DOCUMENT"]',
                });
                // Insert document
                db.insertDocument({
                    id: documentId,
                    file_path: filePath,
                    file_name: basename(filePath),
                    file_hash: fileHash,
                    file_size: stats.size,
                    file_type: ext,
                    status: 'pending',
                    page_count: null,
                    provenance_id: provenanceId,
                    error_message: null,
                    modified_at: null,
                    ocr_completed_at: null,
                    doc_title: null,
                    doc_author: null,
                    doc_subject: null,
                    datalab_file_id: null,
                });
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: documentId,
                    status: isVersionUpdate ? 'version_updated' : 'pending',
                    ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                });
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Failed to ingest ${filePath}: ${errorMsg}`);
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: '',
                    status: 'error',
                    error_message: errorMsg,
                });
            }
        }
        return formatResponse(successResult({
            files_ingested: items.filter((i) => i.status === 'pending').length,
            files_version_updated: items.filter((i) => i.status === 'version_updated').length,
            files_skipped: items.filter((i) => i.status === 'skipped').length,
            files_errored: items.filter((i) => i.status === 'error').length,
            items,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Run OCR pipeline on the ingested files' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1085
|
+
/**
 * Handle ocr_process_pending - Process pending documents through full OCR pipeline
 *
 * Pipeline: OCR -> Extract Images -> Chunk -> Embed -> VLM Process Images -> Complete
 * Provenance chain: DOCUMENT(0) -> OCR_RESULT(1) -> CHUNK(2)/IMAGE(2) -> EMBEDDING(3)/VLM_DESC(3)
 *
 * Claims up to `max_concurrent` (default 3) of the oldest pending documents, runs
 * processOneDocument() on them concurrently, and reports per-document errors and
 * warnings. A document that fails has its derived data cleaned and is marked 'failed'.
 * When auto-clustering is enabled in the config, at least one document succeeded, the
 * number of complete documents reaches the configured threshold, and the newest cluster
 * is more than one hour old (or none exists), a clustering run is triggered afterwards.
 *
 * @param {object} params - Raw tool arguments, validated against ProcessPendingInput.
 * @returns {Promise<*>} The formatted batch summary, or the result of handleError() when
 *   validation fails, DATALAB_API_KEY is missing, or the database operation throws.
 */
export async function handleProcessPending(params) {
    try {
        const input = validateInput(ProcessPendingInput, params);
        if (!process.env.DATALAB_API_KEY) {
            throw new Error('DATALAB_API_KEY environment variable is required for OCR processing');
        }
        // H-1/H-2: Use withDatabaseOperation to track this long-running async operation.
        // This prevents database switches while processing is in-flight and validates
        // generation on completion.
        return await withDatabaseOperation(async ({ db, vector, generation }) => {
            // Document claiming (F-INTEG-3): pick candidate IDs, then flip each one from
            // 'pending' to 'processing' individually and keep only the rows this call
            // actually changed. Re-selecting every row whose status is 'processing' would
            // also return documents claimed by another in-flight call and could leave the
            // rows claimed here stuck in 'processing'.
            const maxConcurrent = input.max_concurrent ?? 3;
            const conn = db.getConnection();
            const claimedAt = new Date().toISOString();
            const candidates = conn
                .prepare(`SELECT id FROM documents WHERE status = 'pending' ORDER BY created_at ASC LIMIT ?`)
                .all(maxConcurrent);
            const claimStatement = conn.prepare(`UPDATE documents SET status = 'processing', modified_at = ?
 WHERE id = ? AND status = 'pending'`);
            const pendingDocs = [];
            for (const candidate of candidates) {
                // changes === 0 means another caller claimed this row first
                if (claimStatement.run(claimedAt, candidate.id).changes > 0) {
                    const claimedDoc = db.getDocument(candidate.id);
                    if (claimedDoc) {
                        pendingDocs.push(claimedDoc);
                    }
                }
            }
            if (pendingDocs.length === 0) {
                return formatResponse(successResult({
                    processed: 0,
                    failed: 0,
                    remaining: 0,
                    message: 'No pending documents to process',
                    next_steps: [{ tool: 'ocr_status', description: 'Check overall processing status' }],
                }));
            }
            const ocrMode = input.ocr_mode ?? state.config.defaultOCRMode;
            const ocrOptions = {
                maxPages: input.max_pages,
                pageRange: input.page_range,
                skipCache: input.skip_cache,
                disableImageExtraction: input.disable_image_extraction,
                extras: input.extras,
                pageSchema: input.page_schema,
                additionalConfig: input.additional_config,
            };
            const results = {
                processed: 0,
                failed: 0,
                errors: [],
                warnings: [],
            };
            const batchId = uuidv4();
            const batchStartTime = Date.now();
            console.error(`[INFO] Batch ${batchId}: processing ${pendingDocs.length} documents`);
            // Default images output directory
            const imagesBaseDir = resolve(state.config.defaultStoragePath, 'images');
            // Build shared processing params for the module-level processOneDocument function
            const processingParams = {
                db,
                vector,
                generation,
                ocrMode,
                ocrOptions,
                pageSchema: input.page_schema,
                imagesBaseDir,
            };
            // Wrapper that handles per-document error tracking and cleanup
            const processDocWithTracking = async (doc) => {
                try {
                    const docWarnings = await processOneDocument(doc, processingParams);
                    results.processed++;
                    if (docWarnings.length > 0) {
                        results.warnings.push({ document_id: doc.id, warnings: docWarnings });
                    }
                }
                catch (error) {
                    const errorMsg = error instanceof Error ? error.message : String(error);
                    console.error(`[ERROR] Document ${doc.id} failed: ${errorMsg}`);
                    // F-INTEG-1: Clean up partial derived data (orphaned chunks, embeddings)
                    // before marking as failed, so a retry starts from a clean state.
                    try {
                        db.cleanDocumentDerivedData(doc.id);
                        console.error(`[INFO] Cleaned partial data for failed document ${doc.id}`);
                    }
                    catch (cleanupError) {
                        const cleanupMsg = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
                        console.error(`[WARN] Cleanup of partial data failed for ${doc.id}: ${cleanupMsg}`);
                    }
                    // A throw here would reject this promise; Promise.allSettled below would
                    // swallow it and the failure would never reach `results`.
                    try {
                        db.updateDocumentStatus(doc.id, 'failed', errorMsg);
                    }
                    catch (statusError) {
                        const statusMsg = statusError instanceof Error ? statusError.message : String(statusError);
                        console.error(`[ERROR] Could not mark document ${doc.id} as failed: ${statusMsg}`);
                    }
                    results.failed++;
                    results.errors.push({ document_id: doc.id, error: errorMsg });
                }
                // global.gc is only defined when Node is started with --expose-gc
                if (typeof global.gc === 'function') {
                    global.gc();
                }
            };
            // FIX-P1-2: Execute documents in parallel batches of at most max_concurrent
            for (let batchStart = 0; batchStart < pendingDocs.length; batchStart += maxConcurrent) {
                const batch = pendingDocs.slice(batchStart, batchStart + maxConcurrent);
                if (batch.length > 1) {
                    console.error(`[INFO] Processing document batch ${Math.floor(batchStart / maxConcurrent) + 1}: ` +
                        `${batch.length} documents (${batchStart + 1}-${batchStart + batch.length} of ${pendingDocs.length})`);
                }
                await Promise.allSettled(batch.map(processDocWithTracking));
            }
            // Count remaining pending documents in SQL instead of loading rows to count them
            const remaining = conn
                .prepare('SELECT COUNT(*) as cnt FROM documents WHERE status = ?')
                .get('pending').cnt;
            // Auto-clustering check
            let autoClusterResult;
            const config = getConfig();
            if (config.autoClusterEnabled && results.processed > 0) {
                const totalDocs = conn.prepare('SELECT COUNT(*) as cnt FROM documents WHERE status = ?').get('complete').cnt;
                const threshold = config.autoClusterThreshold ?? 10;
                // Check if we have enough docs and no recent clustering run
                const lastCluster = conn.prepare('SELECT MAX(created_at) as latest FROM clusters').get();
                const lastClusterDate = lastCluster?.latest ? new Date(lastCluster.latest) : null;
                const hoursSinceLastCluster = lastClusterDate ? (Date.now() - lastClusterDate.getTime()) / 3600000 : Infinity;
                if (totalDocs >= threshold && hoursSinceLastCluster > 1) {
                    try {
                        const { runClustering } = await import('../services/clustering/clustering-service.js');
                        const algorithm = config.autoClusterAlgorithm ?? 'hdbscan';
                        const clusterResult = await runClustering(db, vector, { algorithm, n_clusters: null, min_cluster_size: 3, distance_threshold: null, linkage: 'average' });
                        autoClusterResult = { triggered: true, run_id: clusterResult.run_id, clusters: clusterResult.n_clusters, algorithm };
                        console.error(`[Ingestion] Auto-clustering triggered: ${clusterResult.n_clusters} clusters via ${algorithm}`);
                    }
                    catch (e) {
                        // Clustering is best-effort: report the error without failing the batch
                        console.error(`[Ingestion] Auto-clustering failed: ${e instanceof Error ? e.message : String(e)}`);
                        autoClusterResult = { triggered: true, error: e instanceof Error ? e.message : String(e) };
                    }
                }
            }
            // Build response
            const response = {
                batch_id: batchId,
                batch_duration_ms: Date.now() - batchStartTime,
                processed: results.processed,
                failed: results.failed,
                remaining,
                errors: results.errors.length > 0 ? results.errors : undefined,
                warnings: results.warnings.length > 0 ? results.warnings : undefined,
            };
            response.next_steps = [
                { tool: 'ocr_search', description: 'Search across all processed documents' },
                { tool: 'ocr_document_list', description: 'Browse all documents in the database' },
            ];
            // Only suggest comparison when more than one complete document exists
            try {
                const totalDocCount = db
                    .getConnection()
                    .prepare('SELECT COUNT(*) as cnt FROM documents WHERE status = ?')
                    .get('complete').cnt;
                if (totalDocCount > 1) {
                    response.next_steps.push({ tool: 'ocr_document_compare', description: 'Compare differences between documents' });
                }
            }
            catch (error) {
                console.error(`[Ingestion] Failed to query document count for auto-compare hint: ${String(error)}`);
            }
            if (autoClusterResult) {
                response.auto_clustering = autoClusterResult;
            }
            return formatResponse(successResult(response));
        });
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1255
|
+
/**
 * Handle ocr_status - Get OCR processing status
 *
 * With `document_id`: reports that single document and a one-document summary.
 * Without it: lists up to 1000 documents (optionally narrowed by `status_filter`) and
 * reports database-wide totals taken from db.getStats().
 *
 * @param {object} params - Raw tool arguments, validated against OCRStatusInput.
 * @returns {Promise<*>} The formatted status response, or the result of handleError()
 *   (for example when the requested document does not exist).
 */
export async function handleOCRStatus(params) {
    try {
        const input = validateInput(OCRStatusInput, params);
        const { db } = requireDatabase();
        // Shared projection of a document row into the response shape
        const describeDocument = (row) => ({
            document_id: row.id,
            file_name: row.file_name,
            status: row.status,
            page_count: row.page_count,
            error_message: row.error_message ?? undefined,
            created_at: row.created_at,
        });
        if (!input.document_id) {
            // Map filter values - CRITICAL: use 'status' not 'statusFilter' for listDocuments
            const requestedFilter = input.status_filter ?? 'all';
            const statusByFilter = {
                pending: 'pending',
                processing: 'processing',
                complete: 'complete',
                failed: 'failed',
                all: undefined,
            };
            const listed = db.listDocuments({
                status: statusByFilter[requestedFilter],
                limit: 1000,
            });
            const totals = db.getStats();
            const byStatus = totals.documents_by_status;
            return formatResponse(successResult({
                documents: listed.map((row) => describeDocument(row)),
                summary: {
                    total: totals.total_documents,
                    pending: byStatus.pending,
                    processing: byStatus.processing,
                    complete: byStatus.complete,
                    failed: byStatus.failed,
                },
                supplementary: {
                    total_chunks: totals.total_chunks,
                    total_embeddings: totals.total_embeddings,
                    total_extractions: totals.total_extractions,
                    total_form_fills: totals.total_form_fills,
                    ocr_quality: totals.ocr_quality,
                    costs: totals.costs,
                },
                next_steps: [
                    { tool: 'ocr_process_pending', description: 'Process documents still pending OCR' },
                    { tool: 'ocr_retry_failed', description: 'Reset failed documents for reprocessing' },
                ],
            }));
        }
        // Single-document lookup
        const found = db.getDocument(input.document_id);
        if (!found) {
            throw documentNotFoundError(input.document_id);
        }
        // 1 when the document is in the given state, otherwise 0
        const flag = (wanted) => (found.status === wanted ? 1 : 0);
        return formatResponse(successResult({
            documents: [describeDocument(found)],
            summary: {
                total: 1,
                pending: flag('pending'),
                processing: flag('processing'),
                complete: flag('complete'),
                failed: flag('failed'),
            },
            next_steps: [
                { tool: 'ocr_document_get', description: 'View full document details and metadata' },
                { tool: 'ocr_process_pending', description: 'Process documents still pending OCR' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1339
|
+
/**
 * Handle ocr_retry_failed - Reset failed documents back to pending for reprocessing
 *
 * Each document being reset first has its derived data removed through
 * db.cleanDocumentDerivedData() and is then set to 'pending', so reprocessing does not
 * produce duplicate data. With `document_id` only that document is considered (and only
 * if it is currently 'failed'); otherwise up to 1000 failed documents are reset.
 *
 * @param {object} params - Raw tool arguments, validated against RetryFailedInput.
 * @returns {Promise<*>} The formatted reset summary, or the result of handleError()
 *   (for example when the requested document does not exist).
 */
export async function handleRetryFailed(params) {
    try {
        const input = validateInput(RetryFailedInput, params);
        const { db } = requireDatabase();
        const requestedId = input.document_id;
        let idsToReset;
        if (requestedId) {
            const target = db.getDocument(requestedId);
            if (!target) {
                throw documentNotFoundError(requestedId);
            }
            if (target.status !== 'failed') {
                // Nothing to do: report the current state instead of touching the document
                return formatResponse(successResult({
                    reset: 0,
                    message: `Document ${requestedId} is not in failed state (current: ${target.status})`,
                    next_steps: [{ tool: 'ocr_status', description: 'Check document processing status' }],
                }));
            }
            idsToReset = [requestedId];
        }
        else {
            idsToReset = db.listDocuments({ status: 'failed', limit: 1000 }).map((failedDoc) => failedDoc.id);
        }
        for (const documentId of idsToReset) {
            // Clean all derived data before resetting to pending
            db.cleanDocumentDerivedData(documentId);
            db.updateDocumentStatus(documentId, 'pending');
        }
        const resetCount = idsToReset.length;
        return formatResponse(successResult({
            reset: resetCount,
            message: `Reset ${resetCount} failed document(s) to pending (derived data cleaned)`,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Process the reset documents' },
                { tool: 'ocr_status', description: 'Check processing status after retry' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1389
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1390
|
+
// RAW CONVERSION HANDLER (AI-4)
|
|
1391
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1392
|
+
/**
 * Handle ocr_convert_raw - Convert a single file via OCR and return the raw
 * result without storing anything in the database. Intended for quick
 * one-off conversions.
 *
 * Validation: `file_path` must be non-empty, `ocr_mode` defaults to
 * 'balanced', `max_pages` is an integer in 1..7000, and `page_range` may only
 * contain digits, commas, hyphens and whitespace (the same grammar that
 * ocr_process_pending enforces for its page_range).
 *
 * @param {object} params - Raw tool arguments.
 * @returns {Promise<object>} Formatted success response with the markdown
 *   text and the metrics reported by the client, or the result of
 *   handleError() when validation, the file check or the conversion fails.
 */
async function handleConvertRaw(params) {
    try {
        const input = validateInput(z.object({
            file_path: z.string().min(1),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).default('balanced'),
            max_pages: z.number().int().min(1).max(7000).optional(),
            // Reject anything that is not a plain page list such as "0-5,10"
            // before it is forwarded to the OCR client.
            page_range: z
                .string()
                .regex(/^[0-9,\-\s]+$/)
                .optional(),
        }), params);
        // Verify file exists - FAIL FAST
        if (!existsSync(input.file_path)) {
            throw new Error(`File not found: ${input.file_path}`);
        }
        const stats = statSync(input.file_path);
        if (!stats.isFile()) {
            throw new Error(`Not a file: ${input.file_path}`);
        }
        // Use DatalabClient directly without DB storage
        const client = new DatalabClient();
        const result = await client.processRaw(input.file_path, input.ocr_mode, {
            maxPages: input.max_pages,
            pageRange: input.page_range,
        });
        return formatResponse(successResult({
            file_path: input.file_path,
            text_length: result.markdown.length,
            page_count: result.pageCount,
            markdown: result.markdown,
            // Callers always get an object here, even when the client reports no metadata.
            metadata: result.metadata ?? {},
            quality_score: result.qualityScore,
            cost_cents: result.costCents,
            processing_duration_ms: result.durationMs,
            next_steps: [
                { tool: 'ocr_ingest_files', description: 'Ingest the file for full pipeline processing' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1436
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1437
|
+
// REPROCESS HANDLER
|
|
1438
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1439
|
+
/**
 * Handle ocr_reprocess - Reprocess one document, optionally with a different
 * OCR mode. Existing derived data is cleaned first, then the document is run
 * through processOneDocument().
 *
 * M-11 FIX: Previously called handleProcessPending() which uses atomic batch
 * claiming on ALL pending documents. If other documents were already pending,
 * the target document might not be claimed. Now directly claims and processes
 * only the target document via the module-level processOneDocument function.
 *
 * Failure handling: when processing throws, partial derived data is cleaned
 * and the document is marked 'failed'. Both bookkeeping steps are
 * best-effort (failures are logged) so the original processing error is
 * always the one that reaches the caller.
 *
 * @param {object} params - Raw tool arguments: `document_id` (required),
 *   `ocr_mode` (optional), `skip_cache` (defaults to true).
 * @returns {Promise<object>} Formatted success response with the previous and
 *   new quality scores, `quality_change` (string with two decimals, or null
 *   when either score is unavailable), the elapsed time and any warnings; or
 *   the result of handleError() on failure.
 */
async function handleReprocess(params) {
    try {
        const input = validateInput(z.object({
            document_id: z.string().min(1),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).optional(),
            skip_cache: z.boolean().default(true),
        }), params);
        if (!process.env.DATALAB_API_KEY) {
            throw new Error('DATALAB_API_KEY environment variable is required for OCR processing');
        }
        // H-1/H-2: Use withDatabaseOperation to track this long-running async operation.
        return await withDatabaseOperation(async ({ db, vector, generation }) => {
            const doc = db.getDocument(input.document_id);
            if (!doc) {
                throw documentNotFoundError(input.document_id);
            }
            if (doc.status !== 'complete' && doc.status !== 'failed') {
                throw new Error(`Document status must be 'complete' or 'failed' to reprocess (current: ${doc.status})`);
            }
            // Save previous quality score for comparison; it must be read
            // before the clean-up below removes the stored OCR result.
            const previousOCR = db.getOCRResultByDocumentId(doc.id);
            const previousQuality = previousOCR?.parse_quality_score ?? null;
            // Clean all derived data (chunks, embeddings, images, ocr_results, extractions)
            db.cleanDocumentDerivedData(doc.id);
            // M-11 FIX: Directly claim THIS document by setting status to 'processing'.
            // Previously set to 'pending' then called handleProcessPending() which batch-claims
            // from ALL pending documents -- a race condition if other documents are also pending.
            db.updateDocumentStatus(doc.id, 'processing');
            const ocrMode = input.ocr_mode ?? state.config.defaultOCRMode;
            const imagesBaseDir = resolve(state.config.defaultStoragePath, 'images');
            const startTime = Date.now();
            // Process the single document directly -- no batch claiming needed
            let reprocessWarnings = [];
            try {
                reprocessWarnings = await processOneDocument(doc, {
                    db,
                    vector,
                    generation,
                    ocrMode,
                    ocrOptions: {
                        skipCache: input.skip_cache,
                    },
                    imagesBaseDir,
                });
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Reprocess failed for document ${doc.id}: ${errorMsg}`);
                // Clean up partial data and mark as failed
                try {
                    db.cleanDocumentDerivedData(doc.id);
                }
                catch (cleanupError) {
                    console.error(`[WARN] Cleanup of partial data failed for ${doc.id}: ` +
                        `${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`);
                }
                // Guarded like the cleanup above: a failing status write must
                // not replace the processing error that is rethrown below.
                try {
                    db.updateDocumentStatus(doc.id, 'failed', errorMsg);
                }
                catch (statusError) {
                    console.error(`[WARN] Could not mark document ${doc.id} as failed: ` +
                        `${statusError instanceof Error ? statusError.message : String(statusError)}`);
                }
                throw error;
            }
            // Get new quality score
            const newOCR = db.getOCRResultByDocumentId(doc.id);
            const newQuality = newOCR?.parse_quality_score;
            const hasBothScores = previousQuality !== null && newQuality !== null && newQuality !== undefined;
            return formatResponse(successResult({
                document_id: doc.id,
                previous_quality: previousQuality,
                new_quality: newQuality ?? null,
                quality_change: hasBothScores ? (newQuality - previousQuality).toFixed(2) : null,
                processing_duration_ms: Date.now() - startTime,
                // Only surface the warnings key when there is something to report.
                ...(reprocessWarnings.length > 0 ? { warnings: reprocessWarnings } : {}),
                next_steps: [
                    { tool: 'ocr_status', description: 'Check processing status' },
                    { tool: 'ocr_document_get', description: 'View updated document details' },
                ],
            }));
        });
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1530
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1531
|
+
// TOOL DEFINITIONS FOR MCP REGISTRATION
|
|
1532
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1533
|
+
/**
 * Ingestion tools collection for MCP server registration.
 *
 * Each entry maps a tool name to its `description`, its `inputSchema`
 * (a plain object of zod field schemas) and the `handler` function that
 * implements it. The bracketed prefix in each description ([ESSENTIAL],
 * [PROCESSING], [STATUS]) is part of the text shown to clients.
 */
export const ingestionTools = {
    ocr_ingest_directory: {
        description: '[PROCESSING] Use to bulk-ingest all supported files from a directory. Returns per-file status (pending/skipped/error). Follow with ocr_process_pending to run OCR.',
        inputSchema: {
            directory_path: z.string().min(1).describe('Path to directory to scan'),
            recursive: z.boolean().default(true).describe('Scan subdirectories'),
            file_types: z
                .array(z.string())
                .optional()
                .describe('File types to include (default: pdf, png, jpg, docx, etc.)'),
        },
        handler: handleIngestDirectory,
    },
    ocr_ingest_files: {
        description: '[ESSENTIAL] Use to ingest specific files by path into the current database. Returns per-file status. Follow with ocr_process_pending to run OCR.',
        inputSchema: {
            file_paths: z.array(z.string().min(1)).min(1).describe('Array of file paths to ingest'),
        },
        handler: handleIngestFiles,
    },
    ocr_process_pending: {
        description: '[ESSENTIAL] Use after ingesting files to run the full OCR pipeline (OCR, chunking, embedding, VLM). Returns processed/failed counts. Requires DATALAB_API_KEY.',
        inputSchema: {
            max_concurrent: z
                .number()
                .int()
                .min(1)
                .max(10)
                .default(3)
                .describe('Maximum concurrent OCR operations'),
            ocr_mode: z
                .enum(['fast', 'balanced', 'accurate'])
                .optional()
                .describe('OCR processing mode override'),
            max_pages: z
                .number()
                .int()
                .min(1)
                .max(7000)
                .optional()
                .describe('Maximum pages to process per document (Datalab limit: 7000)'),
            page_range: z
                .string()
                .regex(/^[0-9,\-\s]+$/)
                .optional()
                .describe('Specific pages to process, 0-indexed (e.g., "0-5,10")'),
            skip_cache: z.boolean().optional().describe('Force reprocessing, skip Datalab cache'),
            disable_image_extraction: z
                .boolean()
                .optional()
                .describe('Skip image extraction for text-only processing'),
            extras: z
                .array(z.enum([
                'track_changes',
                'chart_understanding',
                'extract_links',
                'table_row_bboxes',
                'infographic',
                'new_block_types',
            ]))
                .optional()
                .describe('Extra Datalab features to enable'),
            page_schema: z
                .string()
                .optional()
                .describe('JSON schema string for structured data extraction per page'),
            additional_config: z
                .record(z.unknown())
                .optional()
                .describe('Additional Datalab config: keep_pageheader_in_output, keep_pagefooter_in_output, keep_spreadsheet_formatting'),
        },
        handler: handleProcessPending,
    },
    ocr_status: {
        description: '[STATUS] Use to check processing status of documents (pending/processing/complete/failed). Returns per-document status and summary counts.',
        inputSchema: {
            document_id: z.string().optional().describe('Specific document ID to check'),
            status_filter: z
                .enum(['pending', 'processing', 'complete', 'failed', 'all'])
                .default('all')
                .describe('Filter by status'),
        },
        handler: handleOCRStatus,
    },
    ocr_retry_failed: {
        description: '[PROCESSING] Use to reset failed documents back to pending for reprocessing. Cleans derived data first. Follow with ocr_process_pending.',
        inputSchema: {
            document_id: z
                .string()
                .optional()
                .describe('Specific document ID to retry (omit to retry all failed)'),
        },
        handler: handleRetryFailed,
    },
    ocr_reprocess: {
        description: '[PROCESSING] Use to re-run OCR on a document with different settings. Cleans existing data first. Returns quality comparison (before/after).',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID to reprocess'),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).optional().describe('OCR mode override'),
            skip_cache: z
                .boolean()
                .default(true)
                .describe('Skip Datalab cache (default: true for reprocessing)'),
        },
        handler: handleReprocess,
    },
    ocr_convert_raw: {
        description: '[PROCESSING] Use when you need a quick OCR preview of a file without creating database records. Converts a file to markdown text via Datalab API and returns the raw result. Use ocr_ingest_files + ocr_process_pending instead for full pipeline processing.',
        inputSchema: {
            file_path: z.string().min(1).describe('Path to file to convert'),
            ocr_mode: z
                .enum(['fast', 'balanced', 'accurate'])
                .default('balanced')
                .describe('OCR processing mode'),
            max_pages: z.number().int().min(1).max(7000).optional().describe('Maximum pages to process'),
            // Same page-list grammar as ocr_process_pending.page_range, so both
            // tools accept and reject the same inputs.
            page_range: z
                .string()
                .regex(/^[0-9,\-\s]+$/)
                .optional()
                .describe('Specific pages to process (0-indexed, e.g., "0-5,10")'),
        },
        handler: handleConvertRaw,
    },
};
|
|
1659
|
+
//# sourceMappingURL=ingestion.js.map
|