ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,1624 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Management MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Extracted from src/index.ts (Task 22).
|
|
5
|
+
* Tools: ocr_document_list, ocr_document_get, ocr_document_delete,
|
|
6
|
+
* ocr_document_find_similar
|
|
7
|
+
*
|
|
8
|
+
* CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
|
|
9
|
+
* Use console.error() for all logging.
|
|
10
|
+
*
|
|
11
|
+
* @module tools/documents
|
|
12
|
+
*/
|
|
13
|
+
import { z } from 'zod';
|
|
14
|
+
import { existsSync, rmSync, writeFileSync, mkdirSync } from 'fs';
|
|
15
|
+
import { resolve, dirname } from 'path';
|
|
16
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
17
|
+
import { requireDatabase, getDefaultStoragePath } from '../server/state.js';
|
|
18
|
+
import { successResult } from '../server/types.js';
|
|
19
|
+
import { validateInput, sanitizePath, DocumentGetInput, DocumentDeleteInput, } from '../utils/validation.js';
|
|
20
|
+
import { listDocumentsWithCursor, encodeCursor } from '../services/storage/database/document-operations.js';
|
|
21
|
+
import { documentNotFoundError, MCPError } from '../server/errors.js';
|
|
22
|
+
import { formatResponse, handleError, fetchProvenanceChain } from './shared.js';
|
|
23
|
+
import { getComparisonSummariesByDocument } from '../services/storage/database/comparison-operations.js';
|
|
24
|
+
import { getClusterSummariesForDocument } from '../services/storage/database/cluster-operations.js';
|
|
25
|
+
import { getImagesByDocument } from '../services/storage/database/image-operations.js';
|
|
26
|
+
import { extractTableStructures } from '../services/chunking/json-block-analyzer.js';
|
|
27
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
28
|
+
// DOCUMENT LIST INPUT SCHEMA (with cursor support)
|
|
29
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
30
|
+
// Input contract for ocr_document_list. Every filter is optional; `limit` and
// `offset` carry defaults so callers may omit them.
const DocumentListInputWithCursor = z.object({
    // Restrict results to a single processing state.
    status_filter: z.enum(['pending', 'processing', 'complete', 'failed']).optional(),
    // Page size: integer in [1, 1000], default 50.
    limit: z.number().int().min(1).max(1000).default(50),
    // Offset-mode start row: integer >= 0, default 0.
    offset: z.number().int().min(0).default(0),
    created_after: z
        .string()
        .datetime()
        .optional()
        .describe('Filter documents created after this ISO 8601 timestamp'),
    created_before: z
        .string()
        .datetime()
        .optional()
        .describe('Filter documents created before this ISO 8601 timestamp'),
    file_type: z
        .string()
        .optional()
        .describe('Filter by file type (e.g., "pdf", "docx")'),
    cursor: z
        .string()
        .optional()
        .describe('Cursor from a previous response for keyset pagination. When provided, offset is ignored.'),
});
|
|
43
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
44
|
+
// DOCUMENT TOOL HANDLERS
|
|
45
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
46
|
+
/**
 * Project a `documents` row into the item shape returned by ocr_document_list.
 *
 * @param {object} extrasStmt - Prepared statement selecting `extras_json` from
 *   `ocr_results` for one document ID.
 * @param {object} d - Document row.
 * @returns {object} List item: core file metadata, nullable doc_* fields and a
 *   `structural_summary` (null when no fingerprint is available).
 */
function toDocumentListItem(extrasStmt, d) {
    return {
        id: d.id,
        file_name: d.file_name,
        file_path: d.file_path,
        file_size: d.file_size,
        file_type: d.file_type,
        status: d.status,
        page_count: d.page_count,
        doc_title: d.doc_title ?? null,
        doc_author: d.doc_author ?? null,
        doc_subject: d.doc_subject ?? null,
        created_at: d.created_at,
        structural_summary: getStructuralSummary(extrasStmt, d.id),
    };
}
/**
 * Handle ocr_document_list - List documents in the current database.
 *
 * Supports both offset-based and cursor-based pagination. When `cursor` is
 * provided the page is fetched through `listDocumentsWithCursor` and `offset`
 * is not used; otherwise a LIMIT/OFFSET query ordered by
 * `created_at DESC, id DESC` is run.
 *
 * @param {object} params - Raw tool arguments, validated against
 *   `DocumentListInputWithCursor`.
 * @returns {Promise<object>} Formatted success response containing
 *   `documents`, `total`, `limit`, `next_cursor`, `next_steps` (plus `offset`
 *   in offset mode), or the result of `handleError` on failure.
 */
export async function handleDocumentList(params) {
    try {
        const input = validateInput(DocumentListInputWithCursor, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Optional filters become parameterized conditions; values are never
        // interpolated into the SQL text.
        const conditions = [];
        const queryParams = [];
        const addFilter = (condition, value) => {
            if (value) {
                conditions.push(condition);
                queryParams.push(value);
            }
        };
        addFilter('status = ?', input.status_filter);
        addFilter('created_at > ?', input.created_after);
        addFilter('created_at < ?', input.created_before);
        addFilter('file_type = ?', input.file_type);
        const whereClause = conditions.length > 0 ? ` WHERE ${conditions.join(' AND ')}` : '';
        // Total matching rows under the filters above (independent of paging).
        const countTotal = () => conn
            .prepare(`SELECT COUNT(*) as total FROM documents${whereClause}`)
            .get(...queryParams).total;
        // Prepared once and reused for every row's structural summary.
        const extrasStmt = conn.prepare('SELECT extras_json FROM ocr_results WHERE document_id = ? LIMIT 1');
        if (input.cursor) {
            // NOTE(review): only status_filter is forwarded to the cursor layer,
            // while `total` below honours every filter. Confirm whether
            // listDocumentsWithCursor should also receive the date/file_type filters.
            const cursorResult = listDocumentsWithCursor(conn, {
                status: input.status_filter,
                limit: input.limit,
                cursor: input.cursor,
            });
            const total = countTotal();
            return formatResponse(successResult({
                documents: cursorResult.documents.map((d) => toDocumentListItem(extrasStmt, d)),
                total,
                limit: input.limit,
                next_cursor: cursorResult.next_cursor,
                next_steps: buildDocumentListNextSteps(total),
            }));
        }
        // Standard offset-based pagination path
        const total = countTotal();
        const rows = conn
            .prepare(`SELECT * FROM documents${whereClause} ORDER BY created_at DESC, id DESC LIMIT ? OFFSET ?`)
            .all(...queryParams, input.limit, input.offset);
        // Offer a cursor only when rows remain beyond this page. A full page that
        // happens to be the last one must not advertise a next page, otherwise
        // clients fetch a guaranteed-empty page.
        const hasMore = rows.length === input.limit && input.offset + rows.length < total;
        let next_cursor = null;
        if (hasMore) {
            const lastRow = rows[rows.length - 1];
            next_cursor = encodeCursor(lastRow.created_at, lastRow.id);
        }
        return formatResponse(successResult({
            documents: rows.map((d) => toDocumentListItem(extrasStmt, d)),
            total,
            limit: input.limit,
            offset: input.offset,
            next_cursor,
            next_steps: buildDocumentListNextSteps(total),
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
156
|
+
/**
 * Summarise a document's structural fingerprint stored in `extras_json`.
 *
 * @param {object} extrasStmt - Statement-like object whose `get(documentId)`
 *   returns a row with an `extras_json` string (or nothing).
 * @param {string} documentId - Document to look up.
 * @returns {object|null} `{ table_count, figure_count, heading_count,
 *   content_types }`, or null when there is no row, no `extras_json`, no
 *   `structural_fingerprint`, or the lookup/parse throws (the error is logged
 *   to stderr).
 */
function getStructuralSummary(extrasStmt, documentId) {
    try {
        const rawExtras = extrasStmt.get(documentId)?.extras_json;
        if (!rawExtras) {
            return null;
        }
        const fingerprint = JSON.parse(rawExtras).structural_fingerprint;
        if (!fingerprint) {
            return null;
        }
        // heading_count is the sum of the per-depth counters, 0 when absent.
        let headingTotal = 0;
        const depthCounts = fingerprint.heading_depths;
        if (depthCounts) {
            for (const perDepth of Object.values(depthCounts)) {
                headingTotal = headingTotal + perDepth;
            }
        }
        return {
            table_count: fingerprint.table_count ?? 0,
            figure_count: fingerprint.figure_count ?? 0,
            heading_count: headingTotal,
            content_types: fingerprint.content_type_distribution ?? null,
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`[documents] Failed to parse structural fingerprint for document ${documentId}: ${reason}`);
        return null;
    }
}
|
|
181
|
+
/**
 * Suggest follow-up tools for a document listing.
 *
 * @param {number} total - Number of documents matching the listing filters.
 * @returns {Array<{tool: string, description: string}>} Ingestion tools when
 *   `total` is exactly 0, otherwise browsing/search tools.
 */
function buildDocumentListNextSteps(total) {
    if (total === 0) {
        // Nothing to browse yet: point the caller at ingestion.
        return [
            { tool: 'ocr_ingest_files', description: 'Add documents to the database first' },
            { tool: 'ocr_ingest_directory', description: 'Scan a directory for documents to ingest' },
        ];
    }
    return [
        { tool: 'ocr_document_get', description: 'Get details for a specific document by ID' },
        { tool: 'ocr_search', description: 'Search within the corpus' },
        { tool: 'ocr_document_structure', description: 'View a document outline (headings, tables)' },
    ];
}
|
|
196
|
+
/**
 * Derive the `document_profile` summary from OCR block-type statistics.
 *
 * Counters missing from `stats` are treated as 0 so the complexity arithmetic
 * never degrades to NaN (which would always classify the document as 'low').
 *
 * @param {object|undefined|null} stats - `block_type_stats` from extras_json.
 * @returns {object|null} Profile object, or null when no stats are available.
 */
function buildDocumentProfile(stats) {
    if (!stats) {
        return null;
    }
    const tableBlocks = stats.table_blocks ?? 0;
    const figureBlocks = stats.figure_blocks ?? 0;
    const codeBlocks = stats.code_blocks ?? 0;
    const listBlocks = stats.list_blocks ?? 0;
    let contentComplexity;
    if (tableBlocks + figureBlocks + codeBlocks > 5) {
        contentComplexity = 'high';
    }
    else if (tableBlocks + figureBlocks > 0) {
        contentComplexity = 'medium';
    }
    else {
        contentComplexity = 'low';
    }
    return {
        has_tables: tableBlocks > 0,
        has_figures: figureBlocks > 0,
        has_code: codeBlocks > 0,
        has_lists: listBlocks > 0,
        content_complexity: contentComplexity,
        tables_per_page: stats.tables_per_page ?? null,
        figures_per_page: stats.figures_per_page ?? null,
        text_density: stats.text_density ?? null,
    };
}
/**
 * Handle ocr_document_get - Get detailed information about a specific document.
 *
 * Always returns core document fields, `ocr_info`, `document_profile`,
 * `comparisons` and `next_steps`. Optional sections are controlled by the
 * input flags `include_text`, `include_chunks`, `include_blocks` and
 * `include_full_provenance`.
 *
 * @param {object} params - Raw tool arguments, validated against `DocumentGetInput`.
 * @returns {Promise<object>} Formatted success response, or the result of
 *   `handleError` (e.g. when the document does not exist).
 */
export async function handleDocumentGet(params) {
    try {
        const input = validateInput(DocumentGetInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Always fetch the OCR result for metadata; extracted_text is only
        // copied into the response when include_text is set.
        const ocrResult = db.getOCRResultByDocumentId(doc.id);
        const result = {
            id: doc.id,
            file_name: doc.file_name,
            file_path: doc.file_path,
            file_hash: doc.file_hash,
            file_size: doc.file_size,
            file_type: doc.file_type,
            status: doc.status,
            page_count: doc.page_count,
            doc_title: doc.doc_title ?? null,
            doc_author: doc.doc_author ?? null,
            doc_subject: doc.doc_subject ?? null,
            created_at: doc.created_at,
            provenance_id: doc.provenance_id,
            ocr_info: ocrResult
                ? {
                    ocr_result_id: ocrResult.id,
                    datalab_request_id: ocrResult.datalab_request_id,
                    datalab_mode: ocrResult.datalab_mode,
                    parse_quality_score: ocrResult.parse_quality_score,
                    cost_cents: ocrResult.cost_cents,
                    page_count: ocrResult.page_count,
                    text_length: ocrResult.text_length,
                    processing_duration_ms: ocrResult.processing_duration_ms,
                    content_hash: ocrResult.content_hash,
                }
                : null,
        };
        // Parse extras_json exactly once. A corrupt payload is logged and treated
        // as "no extras" everywhere below, so include_blocks cannot fail the whole
        // request on data the enrichment step already tolerated.
        let extras = null;
        if (ocrResult?.extras_json) {
            try {
                extras = JSON.parse(ocrResult.extras_json);
            }
            catch (parseErr) {
                console.error(`[DocumentGet] Failed to parse extras_json for enrichment fields: ${String(parseErr)}`);
            }
        }
        // Surface enrichment data from extras_json (Tasks 4.1, 4.2, 4.4)
        if (extras !== null && typeof extras === 'object') {
            if (extras.block_type_stats) {
                result.block_type_stats = extras.block_type_stats;
            }
            if (extras.link_count !== undefined) {
                result.link_count = extras.link_count;
                result.structured_links = extras.structured_links ?? [];
            }
            if (extras.structural_fingerprint) {
                result.structural_fingerprint = extras.structural_fingerprint;
            }
        }
        // Compute document_profile from block_type_stats (no additional DB queries)
        result.document_profile = buildDocumentProfile(result.block_type_stats);
        if (input.include_text) {
            result.ocr_text = ocrResult?.extracted_text ?? null;
        }
        if (input.include_chunks) {
            const chunks = db.getChunksByDocumentId(doc.id);
            // Chunk text itself is not returned here, only its length.
            result.chunks = chunks.map((c) => ({
                id: c.id,
                chunk_index: c.chunk_index,
                text_length: c.text.length,
                page_number: c.page_number,
                character_start: c.character_start,
                character_end: c.character_end,
                embedding_status: c.embedding_status,
                heading_context: c.heading_context ?? null,
                heading_level: c.heading_level ?? null,
                section_path: c.section_path ?? null,
                content_types: c.content_types ?? null,
                is_atomic: c.is_atomic ?? 0,
                chunking_strategy: c.chunking_strategy ?? null,
            }));
        }
        if (input.include_blocks && ocrResult) {
            result.json_blocks = ocrResult.json_blocks ? JSON.parse(ocrResult.json_blocks) : null;
            result.extras = extras;
        }
        if (input.include_full_provenance) {
            const chain = db.getProvenanceChain(doc.provenance_id);
            result.provenance_chain = chain.map((p) => ({
                id: p.id,
                type: p.type,
                chain_depth: p.chain_depth,
                processor: p.processor,
                processor_version: p.processor_version,
                content_hash: p.content_hash,
                created_at: p.created_at,
            }));
        }
        // Comparison context: show all comparisons referencing this document
        const comparisons = getComparisonSummariesByDocument(db.getConnection(), doc.id);
        result.comparisons = {
            total: comparisons.length,
            items: comparisons.map((c) => ({
                comparison_id: c.id,
                // Report the *other* side of the pair, whichever slot this document occupies.
                compared_with: c.document_id_1 === doc.id ? c.document_id_2 : c.document_id_1,
                similarity_ratio: c.similarity_ratio,
                summary: c.summary,
                created_at: c.created_at,
            })),
        };
        // Cluster memberships: only attached when the document belongs to at least one cluster
        const clusterMemberships = getClusterSummariesForDocument(db.getConnection(), doc.id);
        if (clusterMemberships.length > 0) {
            result.clusters = clusterMemberships.map((c) => ({
                cluster_id: c.id,
                run_id: c.run_id,
                cluster_index: c.cluster_index,
                label: c.label,
                classification_tag: c.classification_tag,
                coherence_score: c.coherence_score,
            }));
        }
        result.next_steps = [
            { tool: 'ocr_document_page', description: 'Read a specific page of this document' },
            { tool: 'ocr_document_structure', description: 'View document outline and layout' },
            { tool: 'ocr_search', description: 'Search within this document (use document_id filter)' },
            { tool: 'ocr_chunk_list', description: 'List all chunks with section/heading filtering' },
            { tool: 'ocr_form_fill', description: 'Fill form fields using this document' },
            { tool: 'ocr_document_versions', description: 'Find other versions of this document' },
            { tool: 'ocr_document_extras', description: 'View OCR extras (blocks, links, fingerprint)' },
            { tool: 'ocr_document_recommend', description: 'Get cluster-based document recommendations' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
361
|
+
/**
 * Handle ocr_document_delete - Delete a document and all its derived data.
 *
 * Order of operations: count derived rows (for reporting), delete vectors,
 * delete the document row, then remove the extracted-image directory under
 * `<storage>/images/<document id>`.
 *
 * Image cleanup is best-effort: it runs after the database delete, which
 * cannot be undone, so a filesystem failure is logged and reported via
 * `images_cleanup_error` instead of turning a successful delete into an error.
 *
 * @param {object} params - Raw tool arguments, validated against `DocumentDeleteInput`.
 * @returns {Promise<object>} Formatted success response with deletion counts,
 *   or the result of `handleError` (e.g. when the document does not exist).
 */
export async function handleDocumentDelete(params) {
    try {
        const input = validateInput(DocumentDeleteInput, params);
        const { db, vector } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Count items before deletion for reporting
        const chunks = db.getChunksByDocumentId(doc.id);
        const embeddings = db.getEmbeddingsByDocumentId(doc.id);
        const provenance = db.getProvenanceByRootDocument(doc.provenance_id);
        // Delete vectors first
        const vectorsDeleted = vector.deleteVectorsByDocumentId(doc.id);
        // Delete document (cascades to chunks, embeddings, provenance)
        db.deleteDocument(doc.id);
        // Clean up extracted image files on disk (best-effort, see JSDoc)
        let imagesCleanedUp = false;
        let imagesCleanupError = null;
        try {
            const imageDir = resolve(getDefaultStoragePath(), 'images', doc.id);
            if (existsSync(imageDir)) {
                rmSync(imageDir, { recursive: true, force: true });
                imagesCleanedUp = true;
            }
        }
        catch (cleanupError) {
            imagesCleanupError = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
            console.error(`[documents] Document ${doc.id} was deleted but its image directory could not be removed: ${imagesCleanupError}`);
        }
        return formatResponse(successResult({
            document_id: doc.id,
            deleted: true,
            chunks_deleted: chunks.length,
            embeddings_deleted: embeddings.length,
            vectors_deleted: vectorsDeleted,
            provenance_deleted: provenance.length,
            images_directory_cleaned: imagesCleanedUp,
            // Only present when cleanup failed, so the existing response shape is unchanged on success.
            ...(imagesCleanupError !== null ? { images_cleanup_error: imagesCleanupError } : {}),
            next_steps: [{ tool: 'ocr_document_list', description: 'Browse remaining documents' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
402
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
403
|
+
// INPUT SCHEMAS FOR NEW TOOLS
|
|
404
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
405
|
+
// Input contract for the document-structure tool: which document to analyse,
// the output format, and what to include in tree/outline section nodes.
const DocumentStructureInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID'),
    format: z
        .enum(['structure', 'tree', 'outline'])
        .default('structure')
        .describe('Output format: "structure" (headings/tables/figures/code), "tree" (hierarchical section tree with chunks), "outline" (flat numbered section list)'),
    include_chunk_ids: z
        .boolean()
        .default(true)
        .describe('Include chunk IDs in each section node (tree/outline formats only)'),
    include_page_numbers: z
        .boolean()
        .default(true)
        .describe('Include page numbers in each section node (tree/outline formats only)'),
});
|
|
414
|
+
// Input contract for ocr_document_find_similar.
const FindSimilarInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Source document ID'),
    // Maximum number of similar documents to return: integer in [1, 50], default 10.
    limit: z.number().int().min(1).max(50).default(10),
    min_similarity: z
        .number()
        .min(0)
        .max(1)
        .default(0.5)
        .describe('Minimum similarity threshold (0-1)'),
});
|
|
420
|
+
// Input contract for batch metadata updates: a non-empty list of document IDs
// plus the metadata fields to set. All three fields are optional at the schema
// level.
const UpdateMetadataInput = z.object({
    document_ids: z
        .array(z.string().min(1))
        .min(1)
        .describe('Document IDs to update'),
    doc_title: z.string().optional(),
    doc_author: z.string().optional(),
    doc_subject: z.string().optional(),
});
|
|
427
|
+
// Input contract for duplicate detection.
const DuplicateDetectionInput = z.object({
    mode: z
        .enum(['exact', 'near'])
        .default('near')
        .describe('exact: same file_hash; near: high text similarity'),
    similarity_threshold: z
        .number()
        .min(0.5)
        .max(1)
        .default(0.9)
        .describe('Minimum similarity for near-duplicate detection'),
    // Maximum number of groups/pairs to return: integer in [1, 100], default 20.
    limit: z.number().int().min(1).max(100).default(20),
});
|
|
434
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
435
|
+
// CROSS-DOCUMENT SIMILARITY HANDLER
|
|
436
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
437
|
+
/**
 * Handle ocr_document_find_similar - Find documents similar to a given document
 * using averaged chunk embeddings as document centroid for vector search.
 *
 * Steps: load the source document's chunk-embedding IDs, fetch their vectors,
 * average them into a centroid, run a vector search with that centroid, then
 * aggregate hits per document (mean similarity, hit count) excluding the
 * source document itself.
 *
 * @param {object} params - Raw tool arguments, validated against `FindSimilarInput`.
 * @returns {Promise<object>} Formatted success response with
 *   `similar_documents` ranked by `avg_similarity` (descending), or the result
 *   of `handleError` (document missing, no embeddings, no stored vectors, ...).
 */
export async function handleFindSimilar(params) {
    try {
        const input = validateInput(FindSimilarInput, params);
        const { db, vector } = requireDatabase();
        const conn = db.getConnection();
        // Verify document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Get all chunk embeddings for source document
        const embeddingRows = conn
            .prepare('SELECT id FROM embeddings WHERE document_id = ? AND chunk_id IS NOT NULL')
            .all(input.document_id);
        if (embeddingRows.length === 0) {
            throw new MCPError('VALIDATION_ERROR', `Document "${input.document_id}" has no chunk embeddings. Process the document first.`);
        }
        // Collect vectors; embedding rows without a stored vector are skipped
        const vectors = [];
        for (const row of embeddingRows) {
            const vec = vector.getVector(row.id);
            if (vec) {
                vectors.push(vec);
            }
        }
        if (vectors.length === 0) {
            throw new MCPError('VALIDATION_ERROR', `Document "${input.document_id}" has embedding records but no vectors in vec_embeddings.`);
        }
        // Average vectors to create 768-dim document centroid
        const CENTROID_DIMS = 768;
        const centroid = new Float32Array(CENTROID_DIMS);
        for (const vec of vectors) {
            for (let i = 0; i < CENTROID_DIMS; i++) {
                centroid[i] += vec[i];
            }
        }
        for (let i = 0; i < CENTROID_DIMS; i++) {
            centroid[i] /= vectors.length;
        }
        // Search for similar embeddings. Over-fetch by 10x because hits are per
        // chunk and several of them collapse into one document below.
        const resultLimit = input.limit ?? 10;
        const minSim = input.min_similarity ?? 0.5;
        const searchResults = vector.searchSimilar(centroid, {
            limit: resultLimit * 10,
            threshold: minSim,
        });
        // Aggregate by document: average similarity across matching chunks, excluding source doc
        const docSimilarityMap = new Map();
        for (const r of searchResults) {
            if (r.document_id === input.document_id) {
                continue;
            }
            const entry = docSimilarityMap.get(r.document_id);
            if (entry) {
                entry.totalSim += r.similarity_score;
                entry.count += 1;
            }
            else {
                docSimilarityMap.set(r.document_id, { totalSim: r.similarity_score, count: 1 });
            }
        }
        // Rank by average similarity (rounded to 6 decimals), filter by min_similarity, slice to limit
        const ranked = Array.from(docSimilarityMap.entries())
            .map(([docId, { totalSim, count }]) => ({
                document_id: docId,
                avg_similarity: Math.round((totalSim / count) * 1000000) / 1000000,
                matching_chunks: count,
            }))
            .filter((r) => r.avg_similarity >= minSim)
            .sort((a, b) => b.avg_similarity - a.avg_similarity)
            .slice(0, resultLimit);
        // Enrich with document metadata and structural fingerprint. The statement
        // is prepared once here instead of once per ranked document.
        const extrasStmt = conn.prepare('SELECT extras_json FROM ocr_results WHERE document_id = ?');
        const similarDocuments = ranked.map((r) => {
            const simDoc = db.getDocument(r.document_id);
            // Try to include structural fingerprint from extras_json; failures are
            // logged and leave the fingerprint null.
            let structuralFingerprint = null;
            try {
                const ocrRow = extrasStmt.get(r.document_id);
                if (ocrRow?.extras_json) {
                    const extras = JSON.parse(ocrRow.extras_json);
                    if (extras.structural_fingerprint) {
                        structuralFingerprint = extras.structural_fingerprint;
                    }
                }
            }
            catch (error) {
                console.error(`[documents] Failed to enrich structural fingerprint for document ${r.document_id}: ${error instanceof Error ? error.message : String(error)}`);
            }
            return {
                document_id: r.document_id,
                file_name: simDoc?.file_name ?? null,
                file_type: simDoc?.file_type ?? null,
                status: simDoc?.status ?? null,
                avg_similarity: r.avg_similarity,
                matching_chunks: r.matching_chunks,
                structural_fingerprint: structuralFingerprint,
            };
        });
        return formatResponse(successResult({
            source_document_id: input.document_id,
            source_chunk_count: vectors.length,
            similar_documents: similarDocuments,
            total: similarDocuments.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Get details for a similar document' }, { tool: 'ocr_document_compare', description: 'Compare two similar documents' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
552
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
553
|
+
// BATCH METADATA UPDATE HANDLER
|
|
554
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
555
|
+
/**
 * Handle ocr_document_update_metadata - Batch update metadata for multiple documents.
 *
 * Each document is processed independently, so one failure does not abort the
 * batch. Outcomes are reported in three disjoint buckets:
 *   - `updated_count`: documents whose metadata was written,
 *   - `not_found_ids`: IDs with no matching document,
 *   - `failed_updates`: IDs whose lookup/update threw, with the error message
 *     (previously these were misreported as "not found").
 *
 * @param {object} params - Raw tool arguments, validated against `UpdateMetadataInput`.
 * @returns {Promise<object>} Formatted success response, or the result of
 *   `handleError` (e.g. when no metadata field was supplied).
 */
export async function handleUpdateMetadata(params) {
    try {
        const input = validateInput(UpdateMetadataInput, params);
        // Verify at least one metadata field is provided (before requiring database)
        const hasAnyField = [input.doc_title, input.doc_author, input.doc_subject]
            .some((value) => value !== undefined);
        if (!hasAnyField) {
            throw new MCPError('VALIDATION_ERROR', 'At least one metadata field (doc_title, doc_author, doc_subject) must be provided.');
        }
        const { db } = requireDatabase();
        let updatedCount = 0;
        const notFoundIds = [];
        const failedUpdates = [];
        for (const docId of input.document_ids) {
            try {
                const doc = db.getDocument(docId);
                if (!doc) {
                    notFoundIds.push(docId);
                    continue;
                }
                db.updateDocumentMetadata(docId, {
                    docTitle: input.doc_title,
                    docAuthor: input.doc_author,
                    docSubject: input.doc_subject,
                });
                updatedCount++;
            }
            catch (docError) {
                const errMsg = docError instanceof Error ? docError.message : String(docError);
                console.error(`[WARN] Failed to update metadata for document ${docId}: ${errMsg}`);
                // The document may well exist; record this as a failure, not as "not found".
                failedUpdates.push({ document_id: docId, error: errMsg });
            }
        }
        return formatResponse(successResult({
            updated_count: updatedCount,
            not_found_ids: notFoundIds,
            failed_updates: failedUpdates,
            total_requested: input.document_ids.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Verify the updated metadata' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
601
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
602
|
+
// DUPLICATE DOCUMENT DETECTION HANDLER
|
|
603
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
604
|
+
/**
 * Handle ocr_document_duplicates - Detect duplicate documents.
 *
 * Two modes:
 *   - `exact`: groups documents sharing the same `file_hash`.
 *   - `near`: returns rows from the `comparisons` table whose
 *     `similarity_ratio` is at least `similarity_threshold`. Only pairs that
 *     already have a stored comparison are considered.
 *
 * @param {object} params - Raw tool arguments, validated against `DuplicateDetectionInput`.
 * @returns {Promise<object>} Formatted success response (`groups` in exact
 *   mode, `pairs` in near mode), or the result of `handleError`.
 */
export async function handleDuplicateDetection(params) {
    try {
        const input = validateInput(DuplicateDetectionInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const nextSteps = () => [
            { tool: 'ocr_document_compare', description: 'Compare a duplicate pair in detail' },
            { tool: 'ocr_document_delete', description: 'Delete a confirmed duplicate' },
        ];
        if (input.mode === 'exact') {
            // Find documents with same file_hash. IDs and names are aggregated as
            // JSON arrays rather than comma-joined strings: file names may contain
            // commas (and GROUP_CONCAT drops NULLs), either of which would corrupt
            // or misalign the lists after a split(',').
            const groups = conn
                .prepare(`
                SELECT file_hash,
                       json_group_array(id) as doc_ids,
                       json_group_array(file_name) as file_names,
                       COUNT(*) as count
                FROM documents
                GROUP BY file_hash
                HAVING COUNT(*) > 1
                ORDER BY count DESC
                LIMIT ?
            `)
                .all(input.limit);
            const duplicateGroups = groups.map((g) => ({
                file_hash: g.file_hash,
                document_ids: JSON.parse(g.doc_ids),
                file_names: JSON.parse(g.file_names),
                count: g.count,
            }));
            return formatResponse(successResult({
                mode: 'exact',
                total_groups: duplicateGroups.length,
                total_duplicate_documents: duplicateGroups.reduce((sum, g) => sum + g.count, 0),
                groups: duplicateGroups,
                next_steps: nextSteps(),
            }));
        }
        // Near-duplicate mode: query comparisons table
        const comparisons = conn
            .prepare(`
            SELECT c.id as comparison_id, c.document_id_1, c.document_id_2,
                   c.similarity_ratio, c.summary,
                   d1.file_name as file_name_1, d2.file_name as file_name_2
            FROM comparisons c
            JOIN documents d1 ON d1.id = c.document_id_1
            JOIN documents d2 ON d2.id = c.document_id_2
            WHERE c.similarity_ratio >= ?
            ORDER BY c.similarity_ratio DESC
            LIMIT ?
        `)
            .all(input.similarity_threshold, input.limit);
        return formatResponse(successResult({
            mode: 'near',
            similarity_threshold: input.similarity_threshold,
            total_pairs: comparisons.length,
            pairs: comparisons.map((c) => ({
                comparison_id: c.comparison_id,
                document_id_1: c.document_id_1,
                file_name_1: c.file_name_1,
                document_id_2: c.document_id_2,
                file_name_2: c.file_name_2,
                similarity_ratio: c.similarity_ratio,
                summary: c.summary,
            })),
            next_steps: nextSteps(),
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
675
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
676
|
+
// DOCUMENT STRUCTURE ANALYSIS HANDLER
|
|
677
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
678
|
+
/**
 * Derive a flat heading outline from chunk metadata.
 *
 * Only the first chunk seen for each distinct `heading_context` contributes an
 * entry; chunks without a heading are ignored. Entries keep chunk order.
 *
 * @param {Iterable<object>} chunks - Chunks carrying `heading_context`,
 *   `heading_level` and `page_number`.
 * @returns {Array<{level: number, text: string, page: *}>} Outline entries;
 *   `level` falls back to 1 when the chunk has no `heading_level`.
 */
function buildOutlineFromChunks(chunks) {
    // Map preserves insertion order, so values() yields first occurrences in chunk order.
    const firstEntryByHeading = new Map();
    for (const { heading_context: heading, heading_level: depth, page_number: page } of chunks) {
        if (!heading || firstEntryByHeading.has(heading)) {
            continue;
        }
        firstEntryByHeading.set(heading, { level: depth ?? 1, text: heading, page });
    }
    return [...firstEntryByHeading.values()];
}
|
|
697
|
+
/**
 * Walk a block tree from json_blocks, extracting structural elements.
 *
 * Results are appended to the caller-supplied arrays:
 * - `SectionHeader` / `Title` blocks with non-empty text go to `outline`
 *   (text is `block.text`, else `block.html`; level is `block.level`,
 *   else 1 for `Title` and 2 for `SectionHeader`).
 * - `Table` blocks go to `tables`, `Figure` / `Picture` blocks to `figures`,
 *   and `Code` blocks to `codeBlocks`.
 * The page is `block.page`, else `block.page_idx`, else null. Children are
 * walked recursively for every block type.
 *
 * Entries that are not objects (for example `null` in a malformed payload)
 * are skipped, and a non-array `blocks` argument is treated as empty, so one
 * bad entry does not abort extraction for the whole document.
 *
 * @param {Array<object>} blocks - blocks at the current tree level
 * @param {Array<{level: number, text: string, page: *}>} outline - receives headings
 * @param {Array<{page: *, caption: *}>} tables - receives tables
 * @param {Array<{page: *, caption: *}>} figures - receives figures/pictures
 * @param {Array<{page: *, language: *}>} codeBlocks - receives code blocks
 * @returns {void}
 */
function walkBlocks(blocks, outline, tables, figures, codeBlocks) {
    if (!Array.isArray(blocks)) {
        return;
    }
    for (const block of blocks) {
        if (block === null || typeof block !== 'object') {
            continue;
        }
        const blockType = block.block_type;
        const page = block.page ?? block.page_idx ?? null;
        switch (blockType) {
            case 'SectionHeader':
            case 'Title': {
                const text = block.text ?? block.html ?? '';
                if (text) {
                    const level = block.level ?? (blockType === 'Title' ? 1 : 2);
                    outline.push({ level, text, page });
                }
                break;
            }
            case 'Table':
                tables.push({ page, caption: block.caption ?? undefined });
                break;
            case 'Figure':
            case 'Picture':
                figures.push({ page, caption: block.caption ?? undefined });
                break;
            case 'Code':
                codeBlocks.push({ page, language: block.language ?? undefined });
                break;
            default:
                break;
        }
        // Containers and leaf types alike may nest further blocks.
        if (Array.isArray(block.children)) {
            walkBlocks(block.children, outline, tables, figures, codeBlocks);
        }
    }
}
|
|
729
|
+
/**
 * Handle ocr_document_structure - Analyze document structure
 *
 * Supports three formats:
 * - 'structure' (default): headings, tables, figures, code blocks from json_blocks or chunks
 * - 'tree': hierarchical section tree with chunk_ids, page_numbers (merged from ocr_document_sections)
 * - 'outline': flat numbered outline with chunk counts (merged from ocr_document_sections)
 *
 * For 'structure', the stored json_blocks of the document's OCR result are
 * preferred; when they are missing, empty, unparsable, or walking them fails,
 * the outline is rebuilt from chunk heading metadata instead and `source` is
 * reported as 'chunks'. A `document_map` (sections plus table column details)
 * is added only when json_blocks were used and extracted text is available;
 * a failure while building it is logged and the map is omitted.
 *
 * @param {object} params - raw tool arguments, validated against DocumentStructureInput
 * @returns {Promise<*>} formatted success response, or the result of handleError on failure
 */
export async function handleDocumentStructure(params) {
    try {
        const input = validateInput(DocumentStructureInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Delegate to sections logic for tree/outline formats
        if (input.format === 'tree' || input.format === 'outline') {
            return handleDocumentSectionsInternal(db, doc, input);
        }
        // Default 'structure' format: headings, tables, figures, code blocks
        const conn = db.getConnection();
        let outline = [];
        let tables = [];
        let figures = [];
        let codeBlocks = [];
        let source = 'chunks';
        let documentMap = null;
        // Try json_blocks first (richer structure)
        const ocrRow = conn
            .prepare('SELECT json_blocks FROM ocr_results WHERE document_id = ?')
            .get(input.document_id);
        if (ocrRow?.json_blocks) {
            try {
                const parsed = JSON.parse(ocrRow.json_blocks);
                // Handle both formats: array of blocks or {children: [...]} object.
                // Any other payload (including JSON null) is treated as "no blocks".
                const blocks = Array.isArray(parsed)
                    ? parsed
                    : (Array.isArray(parsed?.children) ? parsed.children : null);
                if (blocks && blocks.length > 0) {
                    // Walk into scratch arrays and commit only once the walk has
                    // finished, so a failure midway cannot leave half-collected
                    // elements mixed into the chunk-based fallback below.
                    const walked = { outline: [], tables: [], figures: [], codeBlocks: [] };
                    walkBlocks(blocks, walked.outline, walked.tables, walked.figures, walked.codeBlocks);
                    ({ outline, tables, figures, codeBlocks } = walked);
                    source = 'json_blocks';
                    // Build document map with table column details
                    try {
                        const ocrTextRow = conn
                            .prepare('SELECT extracted_text FROM ocr_results WHERE document_id = ?')
                            .get(input.document_id);
                        if (ocrTextRow?.extracted_text) {
                            // Pass the original parsed object (or wrap array in {children:...})
                            const jsonBlocksRoot = Array.isArray(parsed)
                                ? { children: parsed }
                                : parsed;
                            const tableStructures = extractTableStructures(
                                jsonBlocksRoot,
                                ocrTextRow.extracted_text,
                                [], // pageOffsets not needed for structure extraction
                            );
                            documentMap = {
                                sections: outline.map((o) => ({
                                    heading: o.text,
                                    level: o.level,
                                    page: o.page,
                                })),
                                tables: tableStructures.map((ts) => ({
                                    page: ts.pageNumber,
                                    columns: ts.columnHeaders,
                                    row_count: ts.rowCount,
                                    column_count: ts.columnCount,
                                })),
                                figures: figures.map((f) => ({
                                    page: f.page,
                                    caption: f.caption ?? null,
                                })),
                                code_blocks: codeBlocks.map((cb) => ({
                                    page: cb.page,
                                    language: cb.language ?? null,
                                })),
                            };
                        }
                    }
                    catch (mapErr) {
                        console.error(`[DocumentStructure] Failed to build document_map: ${String(mapErr)}`);
                    }
                }
            }
            catch (parseErr) {
                console.error(`[DocumentStructure] Failed to parse json_blocks for ${input.document_id}: ${String(parseErr)}`);
                // Fall through to chunk-based analysis
            }
        }
        // Fallback to chunks if no json_blocks or parsing failed
        if (source === 'chunks') {
            const chunks = db.getChunksByDocumentId(input.document_id);
            // Assigned rather than spread into push(): outline is still empty
            // here, and this avoids passing one argument per heading.
            outline = buildOutlineFromChunks(chunks.map((c) => ({
                heading_context: c.heading_context ?? null,
                heading_level: c.heading_level ?? null,
                page_number: c.page_number,
            })));
        }
        const responseData = {
            document_id: doc.id,
            file_name: doc.file_name,
            page_count: doc.page_count,
            format: 'structure',
            source,
            outline,
            tables: { count: tables.length, items: tables },
            figures: { count: figures.length, items: figures },
            code_blocks: { count: codeBlocks.length, items: codeBlocks },
            total_structural_elements: outline.length + tables.length + figures.length + codeBlocks.length,
            next_steps: [{ tool: 'ocr_document_page', description: 'Read a specific page from the document' }, { tool: 'ocr_search', description: 'Search within the document' }, { tool: 'ocr_document_tables', description: 'Extract table data from the document' }],
        };
        if (documentMap) {
            responseData.document_map = documentMap;
        }
        return formatResponse(successResult(responseData));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
847
|
+
/**
 * Render a section tree as a flat list of numbered outline lines, depth-first.
 *
 * Numbering is hierarchical ("1", "1.1", "1.2", "2", ...). Each line has the
 * form `<number>. <name>[ (pages <page_range>)] [<chunk_count> chunks]`, e.g.
 * "1. Introduction (pages 1-3) [5 chunks]". The page part is omitted when the
 * node has no truthy `page_range`.
 *
 * @param {Array<{name: string, chunk_count: number, page_range?: ?string, children?: Array}>} nodes
 * @param {string} [prefix=''] - number of the parent section, empty at top level
 * @returns {string[]} one line per node, parents before their children
 */
function flattenToOutline(nodes, prefix = '') {
    return nodes.flatMap((node, position) => {
        const ordinal = position + 1;
        const label = prefix ? `${prefix}.${ordinal}` : `${ordinal}`;
        const pages = node.page_range ? ` (pages ${node.page_range})` : '';
        const ownLine = `${label}. ${node.name}${pages} [${node.chunk_count} chunks]`;
        const hasChildren = Boolean(node.children) && node.children.length > 0;
        return hasChildren
            ? [ownLine, ...flattenToOutline(node.children, label)]
            : [ownLine];
    });
}
|
|
864
|
+
/**
 * Internal handler for section tree/outline format (merged from ocr_document_sections).
 * Called by handleDocumentStructure when format='tree' or format='outline'.
 *
 * Builds a tree from each chunk's `section_path` ("Heading 1 > Heading 2 > ...").
 * A chunk is counted only on the deepest node of its path; ancestors are
 * created as needed but do not accumulate the chunk. Chunks with no usable
 * path go to an internal root node, which is reported only via `root_chunks`.
 * This includes paths that contain nothing but separators or whitespace;
 * those chunks were previously counted as sectioned yet attached to no node.
 *
 * @param {object} db - database handle providing getChunksByDocumentId
 * @param {object} doc - document record (id and file_name are read)
 * @param {object} input - validated input; reads document_id, format,
 *   include_chunk_ids and include_page_numbers
 * @returns {Promise<*>} formatted success response, or the result of handleError on failure
 */
async function handleDocumentSectionsInternal(db, doc, input) {
    try {
        const chunks = db.getChunksByDocumentId(input.document_id);
        /** Create an empty tree node; optional arrays exist only when requested. */
        const createNode = (name) => ({
            name,
            chunk_count: 0,
            heading_level: null,
            first_chunk_index: null,
            last_chunk_index: null,
            chunk_ids: input.include_chunk_ids ? [] : undefined,
            page_numbers: input.include_page_numbers ? [] : undefined,
            children: [],
        });
        /** Record one chunk on a node: count, index range, and optional id/page lists. */
        const attachChunk = (node, chunk) => {
            node.chunk_count++;
            const chunkIndex = chunk.chunk_index;
            if (chunkIndex != null) {
                if (node.first_chunk_index === null || chunkIndex < node.first_chunk_index) {
                    node.first_chunk_index = chunkIndex;
                }
                if (node.last_chunk_index === null || chunkIndex > node.last_chunk_index) {
                    node.last_chunk_index = chunkIndex;
                }
            }
            if (node.chunk_ids) {
                node.chunk_ids.push(chunk.id);
            }
            // `!= null` also rejects undefined, which would otherwise end up in page_range.
            if (node.page_numbers && chunk.page_number != null
                && !node.page_numbers.includes(chunk.page_number)) {
                node.page_numbers.push(chunk.page_number);
            }
        };
        // Build tree from section_path strings
        const root = createNode('(root)');
        let chunksWithSections = 0;
        let chunksWithoutSections = 0;
        for (const chunk of chunks) {
            // Parse section_path: "Heading 1 > Heading 2 > Heading 3"
            const parts = chunk.section_path
                ? chunk.section_path.split(' > ').map((s) => s.trim()).filter((s) => s.length > 0)
                : [];
            if (parts.length === 0) {
                // No usable section path: the chunk belongs to the root.
                chunksWithoutSections++;
                attachChunk(root, chunk);
                continue;
            }
            chunksWithSections++;
            let current = root;
            for (const partName of parts) {
                let child = current.children.find((c) => c.name === partName);
                if (!child) {
                    child = createNode(partName);
                    current.children.push(child);
                }
                current = child;
            }
            // Only add chunk to the deepest (leaf) level
            attachChunk(current, chunk);
            // Set heading_level from the chunk (first non-null wins)
            if (current.heading_level === null && chunk.heading_level != null) {
                current.heading_level = chunk.heading_level;
            }
        }
        // Post-process: compute page_range for nodes with page_numbers
        const computePageRange = (node) => {
            if (node.page_numbers && node.page_numbers.length > 0) {
                node.page_numbers.sort((a, b) => a - b);
                const min = node.page_numbers[0];
                const max = node.page_numbers[node.page_numbers.length - 1];
                node.page_range = min === max ? String(min) : `${min}-${max}`;
            }
            else {
                node.page_range = null;
            }
            for (const child of node.children) {
                computePageRange(child);
            }
        };
        if (input.include_page_numbers) {
            computePageRange(root);
        }
        // Count total sections in the tree
        const countSections = (nodes) => nodes.reduce((count, node) => count + 1 + countSections(node.children), 0);
        const totalSections = countSections(root.children);
        const nextSteps = [{ tool: 'ocr_document_page', description: 'Read a specific page from the document' }, { tool: 'ocr_search', description: 'Search within the document' }, { tool: 'ocr_document_tables', description: 'Extract table data from the document' }];
        if (input.format === 'outline') {
            // Flat numbered outline format
            const outline = flattenToOutline(root.children);
            return formatResponse(successResult({
                document_id: doc.id,
                file_name: doc.file_name,
                format: 'outline',
                total_chunks: chunks.length,
                chunks_with_sections: chunksWithSections,
                chunks_without_sections: chunksWithoutSections,
                total_sections: totalSections,
                root_chunks: root.chunk_count,
                outline,
                next_steps: nextSteps,
            }));
        }
        // Default: tree format
        return formatResponse(successResult({
            document_id: doc.id,
            file_name: doc.file_name,
            format: 'tree',
            total_chunks: chunks.length,
            chunks_with_sections: chunksWithSections,
            chunks_without_sections: chunksWithoutSections,
            total_sections: totalSections,
            sections: root.children,
            root_chunks: root.chunk_count,
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1012
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1013
|
+
// UNIFIED EXPORT INPUT SCHEMA (MERGE-A: ocr_document_export + ocr_corpus_export → ocr_export)
|
|
1014
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1015
|
+
// Input schema for the unified ocr_export tool. Whether `document_id` is
// present decides between single-document and corpus export; several flags
// only apply to one of the two modes, as their descriptions state.
const ExportInput = z.object({
    document_id: z.string().min(1).optional().describe('Document ID to export. Omit to export entire corpus.'),
    format: z.enum(['json', 'markdown', 'csv']).default('json').describe('Export format: json/markdown for single doc, json/csv for corpus'),
    output_path: z.string().min(1).describe('Path to save exported file'),
    include_images: z.boolean().default(true).describe('Include image data in export'),
    include_extractions: z.boolean().default(true).describe('Include structured extractions (single doc only)'),
    include_provenance: z.boolean().default(false).describe('Include provenance chain (single doc only)'),
    include_chunks: z.boolean().default(false).describe('Include chunk list per document (corpus only)'),
});
|
|
1030
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1031
|
+
// UNIFIED EXPORT HANDLER (MERGE-A)
|
|
1032
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1033
|
+
/**
 * Handle ocr_export - single entry point for document and corpus exports.
 *
 * With `document_id`: exports that one document (json or markdown; csv is rejected).
 * Without `document_id`: exports the whole corpus (json or csv; markdown is rejected).
 *
 * @param {object} params - raw tool arguments, validated against ExportInput
 * @returns {Promise<*>} the delegated handler's response, or the result of handleError on failure
 */
export async function handleExport(params) {
    try {
        const input = validateInput(ExportInput, params);
        const isSingleDocument = Boolean(input.document_id);
        // Reject the one format each mode cannot produce before delegating.
        if (isSingleDocument && input.format === 'csv') {
            throw new MCPError('VALIDATION_ERROR', 'CSV format only supported for corpus export, not single document. Use json or markdown.');
        }
        if (!isSingleDocument && input.format === 'markdown') {
            throw new MCPError('VALIDATION_ERROR', 'Markdown format only supported for single document export, not corpus. Use json or csv.');
        }
        return isSingleDocument
            ? handleDocumentExportInternal(input)
            : handleCorpusExportInternal(input);
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1060
|
+
/**
 * Internal: Export all data for a single document to JSON or markdown
 *
 * Writes one file at the sanitized `output_path`, creating parent directories
 * as needed. The export always contains document metadata, OCR result
 * (when one exists) and chunks; images, extractions and the provenance chain
 * are included according to the `include_*` flags. Any format other than
 * 'json' is written as Markdown.
 *
 * @param {object} input - validated ExportInput with `document_id` set
 * @returns {Promise<*>} formatted success response with the output path and
 *   counts, or the result of handleError on failure
 */
async function handleDocumentExportInternal(input) {
    try {
        const { db } = requireDatabase();
        // Get document record
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Sanitize the output path up front so that, if it is rejected,
        // no export data has been loaded for nothing.
        const safePath = sanitizePath(input.output_path);
        // Get OCR results
        const ocrResult = db.getOCRResultByDocumentId(doc.id);
        // Get all chunks
        const chunks = db.getChunksByDocumentId(doc.id);
        // Get images if requested
        const images = input.include_images
            ? getImagesByDocument(db.getConnection(), doc.id).map((img) => ({
                id: img.id,
                page_number: img.page_number,
                image_index: img.image_index,
                block_type: img.block_type,
                extracted_path: img.extracted_path,
                width: img.dimensions?.width ?? null,
                height: img.dimensions?.height ?? null,
                vlm_status: img.vlm_status,
                vlm_description: img.vlm_description ?? null,
                vlm_image_type: img.vlm_structured_data?.imageType ?? null,
                created_at: img.created_at,
            }))
            : [];
        // Get extractions if requested
        const extractions = input.include_extractions
            ? db.getExtractionsByDocument(doc.id).map((ext) => ({
                id: ext.id,
                schema_json: ext.schema_json,
                extraction_json: ext.extraction_json,
                content_hash: ext.content_hash,
                created_at: ext.created_at,
            }))
            : [];
        // Get provenance if requested
        const provenance = input.include_provenance
            ? fetchProvenanceChain(db, doc.provenance_id, 'DocumentExport')
            : undefined;
        // Create output directory if needed. With `recursive: true` this is a
        // no-op for an existing directory, so no separate existence check
        // (and no check-then-create race) is required.
        mkdirSync(dirname(safePath), { recursive: true });
        if (input.format === 'json') {
            // Build JSON export
            const exportData = {
                document: {
                    id: doc.id,
                    file_name: doc.file_name,
                    file_path: doc.file_path,
                    file_hash: doc.file_hash,
                    file_size: doc.file_size,
                    file_type: doc.file_type,
                    status: doc.status,
                    page_count: doc.page_count,
                    doc_title: doc.doc_title ?? null,
                    doc_author: doc.doc_author ?? null,
                    doc_subject: doc.doc_subject ?? null,
                    created_at: doc.created_at,
                },
                ocr_results: ocrResult
                    ? {
                        id: ocrResult.id,
                        datalab_mode: ocrResult.datalab_mode,
                        parse_quality_score: ocrResult.parse_quality_score,
                        page_count: ocrResult.page_count,
                        text_length: ocrResult.text_length,
                        extracted_text: ocrResult.extracted_text,
                        cost_cents: ocrResult.cost_cents,
                        processing_duration_ms: ocrResult.processing_duration_ms,
                    }
                    : null,
                chunks: chunks.map((c) => ({
                    id: c.id,
                    chunk_index: c.chunk_index,
                    text: c.text,
                    page_number: c.page_number,
                    character_start: c.character_start,
                    character_end: c.character_end,
                    heading_context: c.heading_context ?? null,
                    section_path: c.section_path ?? null,
                    content_types: c.content_types ?? null,
                })),
            };
            if (input.include_images) {
                exportData.images = images;
            }
            if (input.include_extractions) {
                exportData.extractions = extractions;
            }
            if (input.include_provenance && provenance) {
                exportData.provenance = provenance;
            }
            writeFileSync(safePath, JSON.stringify(exportData, null, 2), 'utf-8');
        }
        else {
            // Build Markdown export
            const lines = [
                `# Document Export: ${doc.file_name}`,
                '',
                '## Metadata',
                `- **File:** ${doc.file_path}`,
                `- **Status:** ${doc.status}`,
                `- **Pages:** ${doc.page_count ?? 'N/A'}`,
                `- **Created:** ${doc.created_at}`,
                `- **File Type:** ${doc.file_type}`,
                `- **File Size:** ${doc.file_size} bytes`,
            ];
            if (doc.doc_title) {
                lines.push(`- **Title:** ${doc.doc_title}`);
            }
            if (doc.doc_author) {
                lines.push(`- **Author:** ${doc.doc_author}`);
            }
            // The JSON export carries doc_subject, so the Markdown export lists it too.
            if (doc.doc_subject) {
                lines.push(`- **Subject:** ${doc.doc_subject}`);
            }
            lines.push('');
            if (ocrResult) {
                lines.push('## OCR Info');
                lines.push(`- **Mode:** ${ocrResult.datalab_mode}`);
                // Avoid printing the literal "null"/"undefined" for a missing score.
                lines.push(`- **Quality Score:** ${ocrResult.parse_quality_score ?? 'N/A'}`);
                lines.push(`- **Text Length:** ${ocrResult.text_length}`);
                lines.push(`- **Processing Time:** ${ocrResult.processing_duration_ms}ms`);
                lines.push('');
            }
            if (chunks.length > 0) {
                lines.push('## Content');
                lines.push('');
                for (const chunk of chunks) {
                    const pageInfo = chunk.page_number !== null ? ` (Page ${chunk.page_number})` : '';
                    const heading = chunk.heading_context ? ` - ${chunk.heading_context}` : '';
                    lines.push(`### Chunk ${chunk.chunk_index}${pageInfo}${heading}`);
                    lines.push('');
                    lines.push(chunk.text);
                    lines.push('');
                }
            }
            if (input.include_images && images.length > 0) {
                lines.push('## Images');
                lines.push('');
                images.forEach((img, i) => {
                    const pageInfo = img.page_number !== null ? ` (Page ${img.page_number})` : '';
                    lines.push(`### Image ${i + 1}${pageInfo}`);
                    lines.push(`- **Path:** ${img.extracted_path ?? 'N/A'}`);
                    lines.push(`- **Type:** ${img.block_type ?? 'unknown'}`);
                    lines.push(`- **Size:** ${img.width ?? '?'}x${img.height ?? '?'}`);
                    if (img.vlm_description) {
                        lines.push(`- **Description:** ${img.vlm_description}`);
                    }
                    lines.push('');
                });
            }
            if (input.include_extractions && extractions.length > 0) {
                lines.push('## Extractions');
                lines.push('');
                extractions.forEach((ext, i) => {
                    lines.push(`### Extraction ${i + 1}`);
                    lines.push('');
                    lines.push('**Schema:**');
                    lines.push('```json');
                    lines.push(String(ext.schema_json));
                    lines.push('```');
                    lines.push('');
                    lines.push('**Data:**');
                    lines.push('```json');
                    lines.push(String(ext.extraction_json));
                    lines.push('```');
                    lines.push('');
                });
            }
            if (input.include_provenance && provenance && provenance.length > 0) {
                lines.push('## Provenance');
                lines.push('');
                lines.push('```json');
                lines.push(JSON.stringify(provenance, null, 2));
                lines.push('```');
                lines.push('');
            }
            writeFileSync(safePath, lines.join('\n'), 'utf-8');
        }
        return formatResponse(successResult({
            output_path: safePath,
            format: input.format,
            document_id: doc.id,
            stats: {
                chunk_count: chunks.length,
                image_count: images.length,
                extraction_count: extractions.length,
            },
            next_steps: [{ tool: 'ocr_document_list', description: 'Export another document' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1268
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1269
|
+
// INTERNAL CORPUS EXPORT HANDLER
|
|
1270
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1271
|
+
/**
 * Internal: Export entire corpus metadata and statistics
 *
 * Writes one file at the sanitized `output_path`, creating parent directories
 * as needed:
 * - 'json': an array with one object per document, optionally including its
 *   chunk list (`include_chunks`) and image list (`include_images`).
 * - any other format (csv): one quoted row per document with id, paths,
 *   type, status and page/chunk/image counts.
 *
 * @param {object} input - validated ExportInput without `document_id`
 * @returns {Promise<*>} formatted success response with the output path and
 *   corpus totals, or the result of handleError on failure
 */
async function handleCorpusExportInternal(input) {
    try {
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Get all documents
        const documents = db.listDocuments();
        // Sanitize output path
        const safePath = sanitizePath(input.output_path);
        // Create output directory if needed. `recursive: true` makes this a
        // no-op when the directory already exists.
        mkdirSync(dirname(safePath), { recursive: true });
        // The image-count statement is prepared on first use and then reused
        // for every document instead of being re-prepared on each iteration.
        let imageCountStmt = null;
        const countImages = (documentId) => {
            if (!imageCountStmt) {
                imageCountStmt = conn.prepare('SELECT COUNT(*) as count FROM images WHERE document_id = ?');
            }
            return imageCountStmt.get(documentId)?.count ?? 0;
        };
        let totalChunks = 0;
        let totalImages = 0;
        if (input.format === 'json') {
            // Build JSON export: array of document objects
            const exportDocs = [];
            for (const doc of documents) {
                const chunkRows = db.getChunksByDocumentId(doc.id);
                const chunkCount = chunkRows.length;
                totalChunks += chunkCount;
                const imageCount = countImages(doc.id);
                const docEntry = {
                    id: doc.id,
                    file_path: doc.file_path,
                    file_name: doc.file_name,
                    file_type: doc.file_type,
                    file_size: doc.file_size,
                    status: doc.status,
                    page_count: doc.page_count,
                    doc_title: doc.doc_title ?? null,
                    doc_author: doc.doc_author ?? null,
                    doc_subject: doc.doc_subject ?? null,
                    chunk_count: chunkCount,
                    image_count: imageCount,
                    created_at: doc.created_at,
                };
                if (input.include_chunks) {
                    docEntry.chunks = chunkRows.map((c) => ({
                        id: c.id,
                        chunk_index: c.chunk_index,
                        text: c.text,
                        page_number: c.page_number,
                        heading_context: c.heading_context ?? null,
                        section_path: c.section_path ?? null,
                        content_types: c.content_types ?? null,
                    }));
                }
                // The corpus total counts the images actually exported when
                // they are included, otherwise the COUNT(*) figure.
                let countedImages = imageCount;
                if (input.include_images) {
                    const imgRows = getImagesByDocument(conn, doc.id);
                    countedImages = imgRows.length;
                    docEntry.images = imgRows.map((img) => ({
                        id: img.id,
                        page_number: img.page_number,
                        block_type: img.block_type,
                        extracted_path: img.extracted_path,
                        width: img.dimensions?.width ?? null,
                        height: img.dimensions?.height ?? null,
                        vlm_status: img.vlm_status,
                        vlm_description: img.vlm_description ?? null,
                    }));
                }
                totalImages += countedImages;
                exportDocs.push(docEntry);
            }
            writeFileSync(safePath, JSON.stringify(exportDocs, null, 2), 'utf-8');
        }
        else {
            // CSV format: one row per document.
            // Every field is quoted with embedded quotes doubled; null/undefined
            // become an empty field instead of throwing on `.replace`.
            const csvQuote = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;
            const headers = ['id', 'file_path', 'file_name', 'file_type', 'status', 'page_count', 'chunk_count', 'image_count', 'created_at'];
            const csvLines = [headers.map(csvQuote).join(',')];
            for (const doc of documents) {
                const chunkCount = db.getChunksByDocumentId(doc.id).length;
                totalChunks += chunkCount;
                const imageCount = countImages(doc.id);
                totalImages += imageCount;
                csvLines.push([
                    doc.id,
                    doc.file_path,
                    doc.file_name,
                    doc.file_type,
                    doc.status,
                    doc.page_count,
                    chunkCount,
                    imageCount,
                    doc.created_at,
                ].map(csvQuote).join(','));
            }
            writeFileSync(safePath, csvLines.join('\n'), 'utf-8');
        }
        return formatResponse(successResult({
            output_path: safePath,
            format: input.format,
            document_count: documents.length,
            total_chunks: totalChunks,
            total_images: totalImages,
            next_steps: [{ tool: 'ocr_report_overview', description: 'Get quality and corpus analytics' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1385
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1386
|
+
// DOCUMENT VERSIONS HANDLER
|
|
1387
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1388
|
+
// Input schema for ocr_document_versions: a single non-empty document ID.
const DocumentVersionsInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID to find versions of'),
});
|
|
1391
|
+
/**
 * Handle ocr_document_versions - list every document row that shares the
 * given document's file_path, newest first (ordered by created_at).
 *
 * @param {object} params - raw tool arguments, validated against DocumentVersionsInput
 * @returns {Promise<*>} formatted success response with the version list and
 *   its length, or the result of handleError on failure
 */
async function handleDocumentVersions(params) {
    try {
        const { document_id: documentId } = validateInput(DocumentVersionsInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const doc = db.getDocument(documentId);
        if (!doc) {
            throw documentNotFoundError(documentId);
        }
        // Query ALL documents with the same file_path, ordered by created_at DESC
        const versionQuery = conn.prepare(`SELECT id, file_hash, file_size, status, created_at, ocr_completed_at
FROM documents
WHERE file_path = ?
ORDER BY created_at DESC`);
        const rows = versionQuery.all(doc.file_path);
        // Project each row onto the fields exposed in the response.
        const versions = rows.map(({ id, file_hash, file_size, status, created_at, ocr_completed_at }) => ({
            id,
            file_hash,
            file_size,
            status,
            created_at,
            ocr_completed_at,
        }));
        return formatResponse(successResult({
            document_id: documentId,
            file_path: doc.file_path,
            versions,
            total_versions: rows.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Get details for a specific version' }, { tool: 'ocr_document_compare', description: 'Compare two versions' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1429
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1430
|
+
// DOCUMENT WORKFLOW HANDLER
|
|
1431
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1432
|
+
// Prefix that marks a tag name as a workflow state (tag name = prefix + state).
const WORKFLOW_PREFIX = 'workflow:';

// Color stored on the tag row when a workflow tag is first created, keyed by state.
const WORKFLOW_COLORS = {
    draft: '#6B7280',
    review: '#F59E0B',
    approved: '#10B981',
    rejected: '#EF4444',
    archived: '#6366F1',
};

// Input schema for ocr_document_workflow. `state` is optional at the schema
// level; the handler enforces it for action "set".
const DocumentWorkflowInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID'),
    action: z
        .enum(['get', 'set', 'history'])
        .describe('Action: get current state, set new state, or view history'),
    state: z
        .enum(['draft', 'review', 'approved', 'rejected', 'archived'])
        .optional()
        .describe('New workflow state (required for action=set)'),
    note: z
        .string()
        .max(500)
        .optional()
        .describe('Optional note for state transition'),
});
|
|
1447
|
+
/**
 * Resolve a document's current workflow state from the workflow tag that was
 * linked to it most recently.
 *
 * @param {object} conn - Database connection exposing prepare(sql).
 * @param {string} documentId - ID bound to entity_tags.entity_id.
 * @returns {string} The tag name with the first WORKFLOW_PREFIX occurrence removed,
 *   or 'none' when the query yields no row.
 */
function getCurrentWorkflowState(conn, documentId) {
    const latestWorkflowTag = conn.prepare(`SELECT t.name FROM tags t
JOIN entity_tags et ON et.tag_id = t.id
WHERE et.entity_type = 'document' AND et.entity_id = ?
AND t.name LIKE 'workflow:%'
ORDER BY et.created_at DESC LIMIT 1`);
    const row = latestWorkflowTag.get(documentId);
    if (!row) {
        // No workflow tag has ever been linked to this document.
        return 'none';
    }
    return row.name.replace(WORKFLOW_PREFIX, '');
}
|
|
1460
|
+
/**
 * Handle ocr_document_workflow - Manage document workflow states via tags.
 *
 * A state is represented by a tag named WORKFLOW_PREFIX + state that is linked
 * to the document through an entity_tags row. Links are never removed here, so
 * the newest link is the current state and the full set of links is the history.
 *
 * Actions:
 *  - "get":     return the current state as reported by getCurrentWorkflowState
 *  - "set":     make sure the state's tag exists, link it to the document, and
 *               return the previous and new state
 *  - "history": return every workflow link oldest-first, plus the current state
 *               (the last entry, or 'none' when there are no entries)
 *
 * @param {object} params - Raw tool arguments, validated against DocumentWorkflowInput.
 * @returns {Promise<*>} formatResponse(successResult(...)) on success, or
 *   handleError(error) for any failure (validation, unknown document, missing
 *   state for "set", database errors).
 */
async function handleDocumentWorkflow(params) {
    try {
        const input = validateInput(DocumentWorkflowInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Verify document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Every action returns the same follow-up suggestions.
        const nextSteps = [
            { tool: 'ocr_document_get', description: 'View document details after workflow change' },
            { tool: 'ocr_tag_search', description: 'Find other documents in the same workflow state' },
        ];
        if (input.action === 'get') {
            return formatResponse(successResult({
                document_id: input.document_id,
                current_state: getCurrentWorkflowState(conn, input.document_id),
                next_steps: nextSteps,
            }));
        }
        if (input.action === 'set') {
            // The schema leaves `state` optional, so it has to be enforced here.
            if (!input.state) {
                throw new MCPError('VALIDATION_ERROR', 'state is required when action is "set"');
            }
            // Read the previous state before writing the new link.
            const previousState = getCurrentWorkflowState(conn, input.document_id);
            // Don't delete old workflow tags - preserve history for the 'history' action.
            // The 'get' action uses ORDER BY created_at DESC LIMIT 1 to get current state.
            const tagName = WORKFLOW_PREFIX + input.state;
            const now = new Date().toISOString();
            // Create tag if it doesn't exist.
            // NOTE(review): the note is only written into the shared tag description, and
            // INSERT OR IGNORE presumably skips the row when the tag already exists - so notes
            // on later transitions do not appear to be persisted anywhere. Confirm intent.
            // NOTE(review): the tag insert and the link insert below are separate statements
            // with no transaction visible here - confirm whether atomicity is required.
            conn
                .prepare(`INSERT OR IGNORE INTO tags (id, name, description, color, created_at)
VALUES (?, ?, ?, ?, ?)`)
                .run(uuidv4(), tagName, `Workflow state: ${input.state}${input.note ? ' - ' + input.note : ''}`, WORKFLOW_COLORS[input.state] ?? '#6B7280', now);
            // Get the tag ID (may have been pre-existing)
            const tag = conn
                .prepare('SELECT id FROM tags WHERE name = ?')
                .get(tagName);
            if (!tag) {
                // Fail with a descriptive error instead of a TypeError on `tag.id`,
                // and before anything is written to entity_tags.
                throw new Error(`Workflow tag "${tagName}" could not be created or found`);
            }
            // Apply tag to document.
            // NOTE(review): if entity_tags enforces uniqueness per (entity, tag), returning to
            // a state the document already had would fail on this insert - verify against schema.
            conn
                .prepare(`INSERT INTO entity_tags (id, entity_type, entity_id, tag_id, created_at)
VALUES (?, 'document', ?, ?, ?)`)
                .run(uuidv4(), input.document_id, tag.id, now);
            return formatResponse(successResult({
                document_id: input.document_id,
                previous_state: previousState,
                new_state: input.state,
                transitioned_at: now,
                note: input.note ?? null,
                next_steps: nextSteps,
            }));
        }
        // action === 'history'
        const historyRows = conn
            .prepare(`SELECT t.name, et.created_at
FROM entity_tags et
JOIN tags t ON t.id = et.tag_id
WHERE et.entity_type = 'document' AND et.entity_id = ?
AND t.name LIKE 'workflow:%'
ORDER BY et.created_at ASC`)
            .all(input.document_id);
        // Rows are oldest-first, so the last entry is the current state.
        const currentState = historyRows.length > 0
            ? historyRows[historyRows.length - 1].name.replace(WORKFLOW_PREFIX, '')
            : 'none';
        return formatResponse(successResult({
            document_id: input.document_id,
            current_state: currentState,
            history: historyRows.map((r) => ({
                state: r.name.replace(WORKFLOW_PREFIX, ''),
                applied_at: r.created_at,
            })),
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1539
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1540
|
+
// TOOL DEFINITIONS FOR MCP REGISTRATION
|
|
1541
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1542
|
+
/**
 * Document tools collection for MCP server registration.
 *
 * Maps each tool name to its description, its input schema (a plain object of
 * zod field schemas, or the `.shape` of a zod object schema), and its handler.
 */
export const documentTools = {
    // Browse documents, with filters and pagination.
    ocr_document_list: {
        description: '[ESSENTIAL] Use to browse documents in the current database. Returns metadata with structural summaries. Filter by status, date, or file type. Supports cursor-based pagination for large datasets. Start here after ocr_db_select.',
        inputSchema: {
            status_filter: z.enum(['pending', 'processing', 'complete', 'failed']).optional().describe('Filter by status'),
            limit: z
                .number()
                .int()
                .min(1)
                .max(1000)
                .default(50)
                .describe('Maximum results'),
            offset: z
                .number()
                .int()
                .min(0)
                .default(0)
                .describe('Offset for pagination'),
            created_after: z
                .string()
                .datetime()
                .optional()
                .describe('Filter documents created after this ISO 8601 timestamp'),
            created_before: z
                .string()
                .datetime()
                .optional()
                .describe('Filter documents created before this ISO 8601 timestamp'),
            file_type: z
                .string()
                .optional()
                .describe('Filter by file type (e.g., "pdf", "docx")'),
            cursor: z
                .string()
                .optional()
                .describe('Cursor from previous response for efficient keyset pagination. When provided, offset is ignored. Use next_cursor from the response.'),
        },
        handler: handleDocumentList,
    },

    // Full details for one document; the include_* flags opt into heavier sections.
    ocr_document_get: {
        description: '[ESSENTIAL] Use to get full details for a single document. Returns OCR metadata, structure, quality, and memberships. Use ocr_document_page to read specific pages.',
        inputSchema: {
            document_id: z
                .string()
                .min(1)
                .describe('Document ID'),
            include_text: z
                .boolean()
                .default(false)
                .describe('Include OCR extracted text'),
            include_chunks: z
                .boolean()
                .default(false)
                .describe('Include chunk information'),
            include_blocks: z.boolean().default(false).describe('Include JSON blocks and extras metadata'),
            include_full_provenance: z
                .boolean()
                .default(false)
                .describe('Include full provenance chain'),
        },
        handler: handleDocumentGet,
    },

    // Deletion only validates when confirm is literally true.
    ocr_document_delete: {
        description: '[DESTRUCTIVE] Use to permanently delete a document and all derived data (chunks, embeddings, images, provenance). Requires confirm=true.',
        inputSchema: {
            document_id: z
                .string()
                .min(1)
                .describe('Document ID to delete'),
            confirm: z
                .literal(true)
                .describe('Must be true to confirm deletion'),
        },
        handler: handleDocumentDelete,
    },

    ocr_document_find_similar: {
        description: '[ANALYSIS] Use to find documents similar to a given document by content. Returns ranked list with similarity scores. Requires completed embeddings.',
        inputSchema: FindSimilarInput.shape,
        handler: handleFindSimilar,
    },

    ocr_document_structure: {
        description: '[ESSENTIAL] Document structure. format="structure" (default: headings/tables/figures), "tree" (hierarchical with chunk IDs), or "outline" (flat numbered).',
        inputSchema: DocumentStructureInput.shape,
        handler: handleDocumentStructure,
    },

    ocr_document_update_metadata: {
        description: '[MANAGE] Use to update title, author, or subject metadata on one or more documents. Returns updated document IDs.',
        inputSchema: UpdateMetadataInput.shape,
        handler: handleUpdateMetadata,
    },

    ocr_document_duplicates: {
        description: '[ANALYSIS] Use to find duplicate documents. Exact mode matches file hashes; near mode uses similarity scores from comparisons. Returns duplicate pairs.',
        inputSchema: DuplicateDetectionInput.shape,
        handler: handleDuplicateDetection,
    },

    ocr_export: {
        description: '[STATUS] Export document or corpus data. Provide document_id for single doc (json/markdown), omit for corpus (json/csv).',
        inputSchema: ExportInput.shape,
        handler: handleExport,
    },

    ocr_document_versions: {
        description: '[ANALYSIS] Use to find all versions of a re-ingested document. Returns documents sharing the same file path, newest first.',
        inputSchema: DocumentVersionsInput.shape,
        handler: handleDocumentVersions,
    },

    ocr_document_workflow: {
        description: '[MANAGE] Track document review states. action="get"|"set"|"history". States: draft/review/approved/rejected/archived.',
        inputSchema: DocumentWorkflowInput.shape,
        handler: handleDocumentWorkflow,
    },
};
|
|
1624
|
+
//# sourceMappingURL=documents.js.map
|