ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,1039 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Intelligence MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Tools: ocr_guide, ocr_document_tables, ocr_document_recommend, ocr_document_extras
|
|
5
|
+
*
|
|
6
|
+
* Internal-only data access and analysis tools. No external API calls needed.
|
|
7
|
+
*
|
|
8
|
+
* CRITICAL: NEVER use console.log() - stdout is reserved for the JSON-RPC protocol.
|
|
9
|
+
* Use console.error() for all logging.
|
|
10
|
+
*
|
|
11
|
+
* @module tools/intelligence
|
|
12
|
+
*/
|
|
13
|
+
import { z } from 'zod';
|
|
14
|
+
import { state, hasDatabase, requireDatabase, getDefaultStoragePath } from '../server/state.js';
|
|
15
|
+
import { DatabaseService } from '../services/storage/database/index.js';
|
|
16
|
+
import { successResult } from '../server/types.js';
|
|
17
|
+
import { validateInput } from '../utils/validation.js';
|
|
18
|
+
import { MCPError, documentNotFoundError } from '../server/errors.js';
|
|
19
|
+
import { formatResponse, handleError } from './shared.js';
|
|
20
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
21
|
+
// INPUT SCHEMAS
|
|
22
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
23
|
+
/** Input for ocr_guide: a single optional intent hint. */
const GuideInput = z.object({
    intent: z
        .enum(['explore', 'search', 'ingest', 'analyze', 'status'])
        .optional()
        .describe('Optional intent hint: explore (browse data), search (find content), ingest (add documents), analyze (compare/cluster), status (check health). Omit for general guidance.'),
});

/** Input for ocr_document_tables: the document plus an optional 0-based table index. */
const DocumentTablesInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID to extract tables from'),
    table_index: z
        .number()
        .int()
        .min(0)
        .optional()
        .describe('Specific table index (0-based) to retrieve. Omit for all tables.'),
});

/** Input for ocr_document_recommend: the source document and a result cap (1-50, default 10). */
const DocumentRecommendInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Source document ID to get recommendations for'),
    limit: z
        .number()
        .int()
        .min(1)
        .max(50)
        .default(10)
        .describe('Maximum number of recommendations'),
});

/** Input for ocr_document_extras: the document plus an optional section name. */
const DocumentExtrasInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID to retrieve extras data for'),
    section: z
        .string()
        .optional()
        .describe('Specific extras section to retrieve (charts, links, tracked_changes, table_row_bboxes, infographics). Omit for all.'),
});
|
|
42
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
43
|
+
// TABLE EXTRACTION HELPERS
|
|
44
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
45
|
+
/**
 * Walk a JSON blocks tree depth-first and collect every Table-type block.
 *
 * Page numbers are tracked while descending: a `Page` block sets the current
 * page from its numeric (or numeric-string) `id`, falling back to its `page`
 * field; any block with a numeric `page` field supplies the page when none
 * has been established yet. Stored values are treated as 0-based and
 * reported 1-based.
 *
 * @param {Array<object>} blocks - Top-level blocks to walk.
 * @returns {Array<object>} One parsed table (see parseTableBlock) per Table
 *   block, in document order. Empty when `blocks` is not an array.
 */
function extractTablesFromBlocks(blocks) {
    const tables = [];
    if (!Array.isArray(blocks)) {
        return tables;
    }
    function walkBlock(block, pageNumber) {
        // Parsed JSON may legitimately contain null or primitive entries;
        // skip them instead of throwing on property access.
        if (block === null || typeof block !== 'object') {
            return;
        }
        const blockType = block.block_type;
        // Track page number from Page blocks (handle both number and numeric string IDs)
        let currentPage = pageNumber;
        if (blockType === 'Page') {
            if (typeof block.id === 'number') {
                currentPage = block.id + 1;
            }
            else if (typeof block.id === 'string' && /^\d+$/.test(block.id)) {
                currentPage = Number.parseInt(block.id, 10) + 1;
            }
            else if (typeof block.page === 'number') {
                currentPage = block.page + 1;
            }
        }
        // Fallback: if no page is known yet and the block has a page field, use it
        if (currentPage === null && typeof block.page === 'number') {
            currentPage = block.page + 1;
        }
        if (blockType === 'Table') {
            // tables.length doubles as the running 0-based table index
            tables.push(parseTableBlock(block, tables.length, currentPage));
        }
        // Recurse into children, inheriting the page established so far
        if (Array.isArray(block.children)) {
            for (const child of block.children) {
                walkBlock(child, currentPage);
            }
        }
    }
    for (const block of blocks) {
        walkBlock(block, null);
    }
    return tables;
}
|
|
87
|
+
/**
 * Parse a single Table block into a structured table representation.
 *
 * Cells are taken from the block's `html` when present; only if that yields
 * no cells are TableRow/TableHeader children walked instead.
 *
 * @param {object} block - The Table block (may carry `html` and/or `children`).
 * @param {number} tableIndex - 0-based index of this table, echoed in the result.
 * @param {number|null} pageNumber - Page number, echoed in the result.
 * @returns {{table_index: number, page_number: (number|null), caption: (string|null),
 *   row_count: number, column_count: number, cells: Array<{row: number, col: number, text: string}>}}
 */
function parseTableBlock(block, tableIndex, pageNumber) {
    const cells = [];
    let caption = null;
    if (typeof block.html === 'string') {
        // Accept attributes on <caption>, any letter case, and captions that
        // span lines; strip inner markup so the caption is plain text like
        // the cell text.
        const captionMatch = block.html.match(/<caption\b[^>]*>([\s\S]*?)<\/caption>/i);
        if (captionMatch) {
            caption = captionMatch[1].replace(/<[^>]*>/g, '').trim();
        }
        // Try to extract cells from HTML if available
        extractCellsFromHTML(block.html, cells);
    }
    let maxRow = 0;
    let maxCol = 0;
    // Fall back to children blocks (TableRow/TableCell pattern) when HTML gave nothing
    const children = block.children;
    if (Array.isArray(children) && cells.length === 0) {
        let rowIndex = 0;
        for (const child of children) {
            // `?.` tolerates null entries in the parsed JSON
            const childType = child?.block_type;
            if (childType !== 'TableRow' && childType !== 'TableHeader') {
                continue;
            }
            const rowChildren = child.children;
            if (Array.isArray(rowChildren)) {
                let colIndex = 0;
                for (const cell of rowChildren) {
                    const cellType = cell?.block_type;
                    if (cellType === 'TableCell' || cellType === 'TableHeaderCell') {
                        cells.push({ row: rowIndex, col: colIndex, text: extractBlockText(cell) });
                        colIndex++;
                    }
                }
            }
            // A row counts toward the row index even if it produced no cells
            rowIndex++;
        }
        maxRow = rowIndex > 0 ? rowIndex - 1 : 0;
    }
    // Compute maxRow/maxCol from cells
    for (const cell of cells) {
        if (cell.row > maxRow) {
            maxRow = cell.row;
        }
        if (cell.col > maxCol) {
            maxCol = cell.col;
        }
    }
    return {
        table_index: tableIndex,
        page_number: pageNumber,
        caption,
        row_count: cells.length > 0 ? maxRow + 1 : 0,
        column_count: cells.length > 0 ? maxCol + 1 : 0,
        cells,
    };
}
|
|
149
|
+
/**
 * Extract cells from an HTML table string.
 *
 * Every `<tr>` is a row and every `<td>`/`<th>` inside it a column, numbered
 * from 0 in source order. Cell text has inner tags stripped, HTML character
 * references decoded, and surrounding whitespace trimmed. `colspan`/`rowspan`
 * are not interpreted.
 *
 * @param {string} html - HTML containing table rows.
 * @param {Array<{row: number, col: number, text: string}>} cells - Output
 *   array; extracted cells are appended to it in place.
 * @returns {void}
 */
function extractCellsFromHTML(html, cells) {
    const namedEntities = new Map([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', ' '],
    ]);
    // Single pass, so "&amp;lt;" becomes "&lt;" rather than being decoded twice.
    const decodeEntities = (text) => text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (match, body) => {
        if (body[0] !== '#') {
            return namedEntities.get(body) ?? match;
        }
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = isHex
            ? Number.parseInt(body.slice(2), 16)
            : Number.parseInt(body.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
    // \b keeps "<tr"/"<td"/"<th" from matching longer tag names such as <track> or <thead>
    const rowRegex = /<tr\b[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;
    let rowIndex = 0;
    while ((rowMatch = rowRegex.exec(html)) !== null) {
        const rowContent = rowMatch[1];
        const cellRegex = /<t[dh]\b[^>]*>([\s\S]*?)<\/t[dh]>/gi;
        let cellMatch;
        let colIndex = 0;
        while ((cellMatch = cellRegex.exec(rowContent)) !== null) {
            // Strip tags first, then decode, so an escaped "&lt;b&gt;" is kept as literal text
            const stripped = cellMatch[1].replace(/<[^>]*>/g, '');
            const text = decodeEntities(stripped).trim();
            cells.push({ row: rowIndex, col: colIndex, text });
            colIndex++;
        }
        rowIndex++;
    }
}
|
|
171
|
+
/**
 * Extract text content from a block recursively.
 *
 * Preference order: the block's own `text` (returned as-is), then its `html`
 * with tags stripped and trimmed, then the non-empty texts of its children
 * joined with single spaces.
 *
 * @param {object|null|undefined} block - Block to read; non-objects yield ''.
 * @returns {string} The extracted text, or '' when nothing is available.
 */
function extractBlockText(block) {
    // Children arrays from parsed JSON can hold null entries; treat them as empty.
    if (block === null || typeof block !== 'object') {
        return '';
    }
    if (typeof block.text === 'string') {
        return block.text;
    }
    if (typeof block.html === 'string') {
        return block.html.replace(/<[^>]*>/g, '').trim();
    }
    if (Array.isArray(block.children)) {
        return block.children
            .map((child) => extractBlockText(child))
            .filter(Boolean)
            .join(' ');
    }
    return '';
}
|
|
186
|
+
/**
 * Load the json_blocks column for a document from ocr_results and normalise
 * it to an array of blocks.
 *
 * Two stored shapes are accepted: a bare array, or an object whose
 * `children` property is an array.
 *
 * @param {object} conn - Connection exposing `prepare(sql).get(...)`.
 * @param {string} documentId - Document whose OCR blocks are wanted.
 * @returns {{ok: true, blocks: Array}|{ok: false, reason: string}} On failure
 *   `reason` is 'no_ocr_data', 'parse_error' or 'empty'.
 */
function fetchJsonBlocks(conn, documentId) {
    const statement = conn.prepare('SELECT json_blocks FROM ocr_results WHERE document_id = ?');
    const row = statement.get(documentId);
    const rawJson = row?.json_blocks;
    if (!rawJson) {
        return { ok: false, reason: 'no_ocr_data' };
    }
    let decoded;
    try {
        decoded = JSON.parse(rawJson);
    }
    catch (parseErr) {
        console.error(`[intelligence] Failed to parse json_blocks for ${documentId}: ${String(parseErr)}`);
        return { ok: false, reason: 'parse_error' };
    }
    // Any other JSON shape is treated the same as an empty block list
    let blocks = [];
    if (Array.isArray(decoded)) {
        blocks = decoded;
    }
    else if (decoded && typeof decoded === 'object' && Array.isArray(decoded.children)) {
        blocks = decoded.children;
    }
    return blocks.length === 0
        ? { ok: false, reason: 'empty' }
        : { ok: true, blocks };
}
|
|
219
|
+
/**
 * Filter parsed tables by optional table_index.
 *
 * @param {Array} allTables - Every table parsed from the document.
 * @param {number|undefined} tableIndex - 0-based index to select; `undefined`
 *   selects all tables.
 * @returns {Array|null} `allTables` itself when no index is given, a
 *   one-element array for a valid index, or null when the index is out of
 *   range (negative, non-integer, or >= allTables.length) so the caller can
 *   report it.
 */
function filterTablesByIndex(allTables, tableIndex) {
    if (tableIndex === undefined) {
        return allTables;
    }
    // Reject anything that is not a valid position, so callers never receive [undefined]
    const inRange = Number.isInteger(tableIndex) && tableIndex >= 0 && tableIndex < allTables.length;
    if (!inRange) {
        return null;
    }
    return [allTables[tableIndex]];
}
|
|
232
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
233
|
+
// HANDLER: ocr_guide
|
|
234
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
235
|
+
/**
 * Handle ocr_guide - Contextual navigation aid for AI agents.
 *
 * Inspects current system state (available databases, selected database,
 * document counts by status, chunk/embedding/image/cluster counts) and
 * returns a status, a summary message, the collected context and a list of
 * suggested next tools. No external API calls.
 *
 * Response `status` is one of:
 *  - 'no_database_selected' when no database is selected,
 *  - 'database_error' when the statistics queries throw,
 *  - 'ready' otherwise.
 *
 * @param {object} params - Raw tool arguments, validated against GuideInput.
 * @returns {Promise<object>} Formatted tool response; unexpected errors are
 *   converted by handleError rather than thrown.
 */
async function handleGuide(params) {
    try {
        const input = validateInput(GuideInput, params);
        const intent = input.intent;
        const storagePath = getDefaultStoragePath();
        const databases = DatabaseService.list(storagePath);
        const selectedDb = state.currentDatabaseName;
        const dbSelected = hasDatabase();
        // Build context about current state
        const context = {
            databases_available: databases.length,
            database_names: databases.map((d) => d.name),
            selected_database: selectedDb ?? 'none',
        };
        // If a database is selected, get its stats
        let docCount = 0;
        let pendingCount = 0;
        let completeCount = 0;
        let failedCount = 0;
        let chunkCount = 0;
        let embeddingCount = 0;
        let imageCount = 0;
        let clusterCount = 0;
        let embeddingCoverage = 0;
        let vlmCoverage = 0;
        if (dbSelected) {
            try {
                const { db, vector } = requireDatabase();
                const conn = db.getConnection();
                // Every count query aliases its single column as `c`
                const countRows = (sql) => conn.prepare(sql).get().c;
                const statusRows = conn.prepare('SELECT status, COUNT(*) as count FROM documents GROUP BY status').all();
                for (const row of statusRows) {
                    // Statuses other than these three still count toward the total
                    docCount += row.count;
                    if (row.status === 'pending') {
                        pendingCount = row.count;
                    }
                    else if (row.status === 'complete') {
                        completeCount = row.count;
                    }
                    else if (row.status === 'failed') {
                        failedCount = row.count;
                    }
                }
                chunkCount = countRows('SELECT COUNT(*) as c FROM chunks');
                embeddingCount = countRows('SELECT COUNT(*) as c FROM embeddings');
                imageCount = countRows('SELECT COUNT(*) as c FROM images');
                clusterCount = countRows('SELECT COUNT(*) as c FROM clusters');
                context.database_stats = {
                    total_documents: docCount,
                    complete: completeCount,
                    pending: pendingCount,
                    failed: failedCount,
                    chunks: chunkCount,
                    embeddings: embeddingCount,
                    images: imageCount,
                    clusters: clusterCount,
                    vectors: vector.getVectorCount(),
                };
                // V7: Corpus snapshot for smarter guide
                if (docCount > 0) {
                    const fileTypeRows = conn.prepare("SELECT file_type, COUNT(*) as count FROM documents WHERE file_type IS NOT NULL GROUP BY file_type ORDER BY count DESC").all();
                    const comparisonCount = countRows('SELECT COUNT(*) as c FROM comparisons');
                    // Coverage is "chunks that have an embedding / chunks". Rows in
                    // embeddings with a NULL chunk_id are not chunk embeddings
                    // (presumably image/VLM embeddings - confirm against the schema),
                    // so dividing the raw embeddings count by the chunk count could
                    // exceed 100% or mask missing chunk embeddings.
                    const embeddedChunkCount = chunkCount > 0
                        ? countRows('SELECT COUNT(DISTINCT chunk_id) as c FROM embeddings WHERE chunk_id IS NOT NULL')
                        : 0;
                    // floor (not round) so anything short of full coverage stays below
                    // 100 and still triggers the health-check hint; clamp guards
                    // against embeddings that reference chunks no longer present.
                    embeddingCoverage = chunkCount > 0
                        ? Math.min(100, Math.floor((embeddedChunkCount / chunkCount) * 100))
                        : 0;
                    // Count images with VLM descriptions vs total
                    const vlmCompleteCount = imageCount > 0
                        ? countRows("SELECT COUNT(*) as c FROM images WHERE vlm_status = 'complete'")
                        : 0;
                    vlmCoverage = imageCount > 0
                        ? Math.round((vlmCompleteCount / imageCount) * 100)
                        : 0;
                    context.corpus_snapshot = {
                        document_count: docCount,
                        total_chunks: chunkCount,
                        total_images: imageCount,
                        file_types: fileTypeRows.map((r) => r.file_type),
                        has_clusters: clusterCount > 0,
                        has_comparisons: comparisonCount > 0,
                        embedding_coverage: `${embeddingCoverage}%`,
                        vlm_coverage: `${vlmCoverage}%`,
                    };
                }
            }
            catch (err) {
                // Report the failure as guidance rather than as a tool error so the
                // agent is pointed at the diagnostic tools.
                const errMsg = err instanceof Error ? err.message : String(err);
                context.database_stats_error = errMsg;
                return formatResponse(successResult({
                    status: 'database_error',
                    message: `Database "${selectedDb}" selected but query failed: ${errMsg}. Try ocr_health_check to diagnose.`,
                    context,
                    next_steps: [
                        { tool: 'ocr_health_check', description: 'Diagnose database integrity issues.', priority: 'required' },
                        { tool: 'ocr_db_select', description: 'Re-select the database to reset connection.', priority: 'optional' },
                    ],
                }));
            }
        }
        // Build next_steps based on state and intent
        const next_steps = [];
        if (!dbSelected) {
            if (databases.length === 0) {
                next_steps.push({
                    tool: 'ocr_db_create',
                    description: 'Create a database first, then ingest documents.',
                    priority: 'required',
                });
            }
            else {
                next_steps.push({
                    tool: 'ocr_db_select',
                    description: 'Select a database to work with (see database_names in context above)',
                    priority: 'required',
                });
            }
            return formatResponse(successResult({
                status: 'no_database_selected',
                message: databases.length === 0
                    ? 'No databases exist. Create one with ocr_db_create, then ingest documents.'
                    : `${databases.length} database(s) available. Select one with ocr_db_select to get started.`,
                context,
                next_steps,
            }));
        }
        // Database is selected - provide guidance based on intent and state
        if (intent === 'ingest' || (docCount === 0 && !intent)) {
            next_steps.push({
                tool: 'ocr_ingest_files',
                description: 'Ingest specific files by path.',
                priority: docCount === 0 ? 'required' : 'optional',
            });
            next_steps.push({
                tool: 'ocr_ingest_directory',
                description: 'Scan a directory for documents to ingest.',
                priority: 'optional',
            });
            if (pendingCount > 0) {
                next_steps.push({
                    tool: 'ocr_process_pending',
                    description: `Process ${pendingCount} pending documents through OCR pipeline.`,
                    priority: 'required',
                });
            }
        }
        else if (intent === 'search' || (!intent && completeCount > 0)) {
            next_steps.push({
                tool: 'ocr_search',
                description: 'Search across all documents. Default and recommended search tool.',
                priority: 'recommended',
            });
            next_steps.push({
                tool: 'ocr_rag_context',
                description: 'Get pre-assembled context for answering a specific question.',
                priority: 'recommended',
            });
            if (embeddingCount === 0 && chunkCount > 0) {
                next_steps.push({
                    tool: 'ocr_health_check',
                    description: 'Chunks exist but no embeddings. Run health check with fix=true.',
                    priority: 'required',
                });
            }
        }
        else if (intent === 'explore') {
            next_steps.push({
                tool: 'ocr_document_list',
                description: `Browse ${docCount} documents in the database.`,
                priority: 'recommended',
            });
            next_steps.push({
                tool: 'ocr_report_overview',
                description: 'Get corpus overview with content type distribution (section="corpus").',
                priority: 'optional',
            });
        }
        else if (intent === 'analyze') {
            if (clusterCount > 0) {
                next_steps.push({
                    tool: 'ocr_cluster_list',
                    description: `View ${clusterCount} existing clusters.`,
                    priority: 'recommended',
                });
            }
            else if (completeCount >= 2) {
                next_steps.push({
                    tool: 'ocr_cluster_documents',
                    description: `Cluster ${completeCount} documents by similarity.`,
                    priority: 'recommended',
                });
            }
            if (completeCount >= 2) {
                next_steps.push({
                    tool: 'ocr_document_compare',
                    description: 'Compare two documents to find differences.',
                    priority: 'optional',
                });
            }
            next_steps.push({
                tool: 'ocr_document_duplicates',
                description: 'Find duplicate documents by hash or similarity.',
                priority: 'optional',
            });
        }
        else if (intent === 'status') {
            next_steps.push({
                tool: 'ocr_health_check',
                description: 'Check for data integrity issues.',
                priority: 'recommended',
            });
            next_steps.push({
                tool: 'ocr_db_stats',
                description: 'Get comprehensive database statistics.',
                priority: 'optional',
            });
            if (failedCount > 0) {
                next_steps.push({
                    tool: 'ocr_retry_failed',
                    description: `${failedCount} failed documents. Reset for reprocessing.`,
                    priority: 'recommended',
                });
            }
        }
        else {
            // General guidance when DB has data and no specific intent
            if (pendingCount > 0) {
                next_steps.push({
                    tool: 'ocr_process_pending',
                    description: `${pendingCount} documents awaiting processing.`,
                    priority: 'recommended',
                });
            }
            if (failedCount > 0) {
                next_steps.push({
                    tool: 'ocr_retry_failed',
                    description: `${failedCount} failed documents need attention.`,
                    priority: 'recommended',
                });
            }
            if (completeCount > 0) {
                next_steps.push({
                    tool: 'ocr_search',
                    description: 'Search across all documents.',
                    priority: 'recommended',
                });
            }
            next_steps.push({
                tool: 'ocr_document_list',
                description: `Browse ${docCount} documents.`,
                priority: 'optional',
            });
            // V7: Context-aware next_steps from corpus snapshot
            if (embeddingCoverage < 100 && chunkCount > 0) {
                next_steps.push({
                    tool: 'ocr_health_check',
                    description: `Check for processing gaps (${embeddingCoverage}% embedding coverage).`,
                    priority: 'recommended',
                });
            }
            if (clusterCount > 0) {
                next_steps.push({
                    tool: 'ocr_cluster_list',
                    description: `Explore ${clusterCount} topic clusters.`,
                    priority: 'optional',
                });
            }
        }
        // Build summary message
        const parts = [];
        parts.push(`Database "${selectedDb}" selected.`);
        parts.push(`${docCount} documents (${completeCount} complete, ${pendingCount} pending, ${failedCount} failed).`);
        if (chunkCount > 0) {
            parts.push(`${chunkCount} chunks, ${embeddingCount} embeddings.`);
        }
        if (imageCount > 0) {
            parts.push(`${imageCount} images.`);
        }
        if (clusterCount > 0) {
            parts.push(`${clusterCount} clusters.`);
        }
        return formatResponse(successResult({
            status: 'ready',
            message: parts.join(' '),
            context,
            next_steps,
            workflow_chains: docCount > 0 ? [
                { name: 'find_and_read', steps: ['ocr_search -> ocr_chunk_context -> ocr_document_page'], description: 'Find content, expand context, read full page' },
                { name: 'compare_documents', steps: ['ocr_comparison_discover -> ocr_document_compare -> ocr_comparison_get'], description: 'Find similar pairs, diff them, inspect results' },
                { name: 'process_new', steps: ['ocr_ingest_files -> ocr_process_pending -> ocr_health_check'], description: 'Add files, run OCR pipeline, verify completeness' },
            ] : undefined,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
529
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
530
|
+
// HANDLER: ocr_document_tables
|
|
531
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
532
|
+
/**
 * Handle ocr_document_tables - Extract table data from JSON blocks.
 *
 * Looks up the document, loads its OCR json_blocks, parses every table and
 * optionally narrows the result to a single table index. Every success
 * response carries `document_id`, `file_name`, `tables`, `total_tables`,
 * `source` and `next_steps`.
 *
 * @param {object} params - Raw tool arguments, validated against DocumentTablesInput.
 * @returns {Promise<object>} Formatted tool response. A missing document is
 *   raised via documentNotFoundError and converted by handleError.
 */
async function handleDocumentTables(params) {
    try {
        const input = validateInput(DocumentTablesInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        const nextSteps = [
            { tool: 'ocr_document_page', description: 'Read the page containing a table' },
            { tool: 'ocr_search', description: 'Search for related content' },
        ];
        const blocksResult = fetchJsonBlocks(db.getConnection(), input.document_id);
        if (!blocksResult.ok) {
            // No usable blocks is not an error: report zero tables and say why via `source`
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                tables: [],
                total_tables: 0,
                source: blocksResult.reason === 'no_ocr_data' ? 'no_ocr_results_or_blocks'
                    : blocksResult.reason === 'parse_error' ? 'json_blocks_parse_error'
                        : 'empty_json_blocks',
                next_steps: nextSteps,
            }));
        }
        const allTables = extractTablesFromBlocks(blocksResult.blocks);
        const tables = filterTablesByIndex(allTables, input.table_index);
        if (tables === null) {
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                tables: [],
                total_tables: allTables.length,
                // Same `source` as the normal path, so all response shapes expose the field
                source: 'json_blocks',
                requested_index: input.table_index,
                message: `Table index ${input.table_index} out of range. Document has ${allTables.length} table(s).`,
                next_steps: nextSteps,
            }));
        }
        return formatResponse(successResult({
            document_id: input.document_id,
            file_name: doc.file_name,
            tables,
            total_tables: allTables.length,
            source: 'json_blocks',
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
586
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
587
|
+
// HANDLER: ocr_document_recommend
|
|
588
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
589
|
+
/**
 * Handle ocr_document_recommend - Cluster-based document recommendations.
 *
 * Combines two signals:
 * 1. Cluster peers (documents sharing a cluster with the source document)
 * 2. Vector similarity (documents whose embeddings are close to the centroid
 *    of the source document's chunk embeddings)
 *
 * Each candidate is scored as `0.5` if it is a cluster peer plus its average
 * similarity (0 when it has none), rounded to 6 decimals; candidates are
 * returned best-first, capped at `limit`.
 *
 * @param {object} params - Raw tool arguments, validated against DocumentRecommendInput.
 * @returns {Promise<object>} Formatted tool response. A missing source
 *   document is raised via documentNotFoundError and converted by handleError.
 */
async function handleDocumentRecommend(params) {
    try {
        const input = validateInput(DocumentRecommendInput, params);
        const { db, vector } = requireDatabase();
        // Verify source document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        const conn = db.getConnection();
        const limit = input.limit ?? 10;
        // Map of document_id -> recommendation entry
        const recommendations = new Map();
        // ──────────────────────────────────────────────────────────────
        // Signal 1: Cluster peers
        // ──────────────────────────────────────────────────────────────
        const sourceClusters = conn.prepare('SELECT cluster_id FROM document_clusters WHERE document_id = ?').all(input.document_id);
        if (sourceClusters.length > 0) {
            // Prepare once; the statement is identical for every cluster
            const peerStatement = conn.prepare('SELECT document_id FROM document_clusters WHERE cluster_id = ? AND document_id != ?');
            for (const { cluster_id: clusterId } of sourceClusters) {
                const peers = peerStatement.all(clusterId, input.document_id);
                for (const peer of peers) {
                    const existing = recommendations.get(peer.document_id);
                    if (existing) {
                        existing.cluster_match = true;
                        existing.cluster_ids.push(clusterId);
                    }
                    else {
                        recommendations.set(peer.document_id, {
                            cluster_match: true,
                            cluster_ids: [clusterId],
                            similarity: null,
                        });
                    }
                }
            }
        }
        // ──────────────────────────────────────────────────────────────
        // Signal 2: Vector similarity (centroid approach)
        // ──────────────────────────────────────────────────────────────
        const embeddingRows = conn.prepare('SELECT id FROM embeddings WHERE document_id = ? AND chunk_id IS NOT NULL').all(input.document_id);
        const vectors = [];
        for (const row of embeddingRows) {
            const vec = vector.getVector(row.id);
            if (vec) {
                vectors.push(vec);
            }
        }
        if (vectors.length > 0) {
            // Take the dimensionality from the stored vectors rather than assuming a
            // fixed size: a hard-coded width yields NaN components for shorter
            // vectors and silently truncates longer ones.
            const dims = vectors[0].length;
            // Vectors of a different width cannot be averaged with the rest; skip them
            const usableVectors = vectors.filter((vec) => vec.length === dims);
            // Compute centroid
            const centroid = new Float32Array(dims);
            for (const vec of usableVectors) {
                for (let i = 0; i < dims; i++) {
                    centroid[i] += vec[i];
                }
            }
            for (let i = 0; i < dims; i++) {
                centroid[i] /= usableVectors.length;
            }
            // Search for similar embeddings; over-fetch because hits are per
            // embedding and several may collapse into one document (or belong to
            // the source document itself)
            const searchResults = vector.searchSimilar(centroid, {
                limit: limit * 10,
                threshold: 0.4,
            });
            // Aggregate by document
            const docSimilarityMap = new Map();
            for (const r of searchResults) {
                if (r.document_id === input.document_id) {
                    continue;
                }
                const entry = docSimilarityMap.get(r.document_id);
                if (entry) {
                    entry.totalSim += r.similarity_score;
                    entry.count += 1;
                }
                else {
                    docSimilarityMap.set(r.document_id, { totalSim: r.similarity_score, count: 1 });
                }
            }
            for (const [docId, { totalSim, count }] of docSimilarityMap.entries()) {
                const avgSim = Math.round((totalSim / count) * 1000000) / 1000000;
                const existing = recommendations.get(docId);
                if (existing) {
                    existing.similarity = avgSim;
                }
                else {
                    recommendations.set(docId, {
                        cluster_match: false,
                        cluster_ids: [],
                        similarity: avgSim,
                    });
                }
            }
        }
        // ──────────────────────────────────────────────────────────────
        // Merge, score, and rank
        // ──────────────────────────────────────────────────────────────
        const ranked = [];
        for (const [docId, rec] of recommendations.entries()) {
            const recDoc = db.getDocument(docId);
            // Score: cluster match = 0.5 bonus, similarity = actual value
            const clusterBonus = rec.cluster_match ? 0.5 : 0;
            const simScore = rec.similarity ?? 0;
            const score = Math.round((clusterBonus + simScore) * 1000000) / 1000000;
            const reasons = [];
            if (rec.cluster_match) {
                reasons.push(`cluster_peer (clusters: ${rec.cluster_ids.join(', ')})`);
            }
            if (rec.similarity !== null) {
                reasons.push(`similar (score: ${rec.similarity})`);
            }
            ranked.push({
                document_id: docId,
                // Metadata is null when the candidate id no longer resolves to a document
                file_name: recDoc?.file_name ?? null,
                file_type: recDoc?.file_type ?? null,
                status: recDoc?.status ?? null,
                score,
                reasons,
                cluster_match: rec.cluster_match,
                similarity: rec.similarity,
            });
        }
        ranked.sort((a, b) => b.score - a.score);
        const topRanked = ranked.slice(0, limit);
        return formatResponse(successResult({
            source_document_id: input.document_id,
            source_file_name: doc.file_name,
            source_cluster_count: sourceClusters.length,
            source_embedding_count: embeddingRows.length,
            recommendations: topRanked,
            total_candidates: ranked.length,
            returned: topRanked.length,
            next_steps: [{ tool: 'ocr_document_get', description: 'Get details for a recommended document' }, { tool: 'ocr_document_compare', description: 'Compare the source document with a recommendation' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
736
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
737
|
+
// HANDLER: ocr_document_extras
|
|
738
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
739
|
+
/**
 * Known extras sections. A requested section name is accepted when it is in
 * this list or is present as a key in the document's extras data.
 * Frozen so the shared module-level list cannot be mutated by accident.
 */
const KNOWN_EXTRAS_SECTIONS = Object.freeze(['charts', 'links', 'tracked_changes', 'table_row_bboxes', 'infographics']);
|
|
741
|
+
/**
 * Handle ocr_document_extras - Surface extras_json data from OCR results.
 *
 * Looks up the document, reads the `extras_json` column of its `ocr_results`
 * row, and returns either one requested section or every section found
 * (known sections first, in KNOWN_EXTRAS_SECTIONS order, then any others).
 *
 * @param {object} params - Raw tool arguments, validated against
 *   DocumentExtrasInput. The handler reads `document_id` and the optional
 *   `section`.
 * @returns {Promise<*>} The value produced by formatResponse() on success.
 *   Any error raised inside the handler is passed to handleError() and that
 *   result is returned instead.
 */
async function handleDocumentExtras(params) {
    try {
        const input = validateInput(DocumentExtrasInput, params);
        const { db } = requireDatabase();
        // Verify document exists
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        // Shared "nothing to show" payload: used when the column is empty and
        // when the stored JSON is the literal `null`.
        const buildEmptyResponse = () => formatResponse(successResult({
            document_id: input.document_id,
            file_name: doc.file_name,
            extras: {},
            available_sections: [],
            message: 'No extras data available for this document.',
            next_steps: [{ tool: 'ocr_document_get', description: 'View document details' }],
        }));
        // Get extras_json from ocr_results
        const ocrRow = db.getConnection()
            .prepare('SELECT extras_json FROM ocr_results WHERE document_id = ?')
            .get(input.document_id);
        if (!ocrRow?.extras_json) {
            return buildEmptyResponse();
        }
        let extras;
        try {
            extras = JSON.parse(ocrRow.extras_json);
        }
        catch (parseErr) {
            console.error(`[DocumentExtras] Failed to parse extras_json for ${input.document_id}: ${String(parseErr)}`);
            throw new MCPError('INTERNAL_ERROR', `Failed to parse extras_json: ${String(parseErr)}`);
        }
        if (extras === null) {
            return buildEmptyResponse();
        }
        // Everything below treats extras as a section-keyed dictionary; reject
        // other JSON shapes explicitly instead of failing with a TypeError.
        if (typeof extras !== 'object' || Array.isArray(extras)) {
            const actualType = Array.isArray(extras) ? 'array' : typeof extras;
            throw new MCPError('INTERNAL_ERROR', `extras_json must contain a JSON object, got ${actualType}`);
        }
        // Own-property check: the `in` operator would also match inherited
        // members such as "constructor" or "toString".
        const hasSection = (key) => Object.prototype.hasOwnProperty.call(extras, key);
        // Determine available sections
        const availableSections = Object.keys(extras).filter((key) => extras[key] != null);
        const followUps = [
            { tool: 'ocr_document_tables', description: 'Extract table data from the document' },
            { tool: 'ocr_document_get', description: 'View core document metadata' },
        ];
        // Filter by specific section if requested
        if (input.section) {
            const sectionPresent = hasSection(input.section);
            if (!KNOWN_EXTRAS_SECTIONS.includes(input.section) && !sectionPresent) {
                throw new MCPError('VALIDATION_ERROR', `Unknown section "${input.section}". Available sections: ${availableSections.join(', ')}`);
            }
            // A known section that this document lacks is reported as null.
            const sectionData = sectionPresent ? extras[input.section] : null;
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                section: input.section,
                data: sectionData ?? null,
                available_sections: availableSections,
                next_steps: followUps,
            }));
        }
        // Return all extras organized by section
        const organized = {};
        for (const section of KNOWN_EXTRAS_SECTIONS) {
            if (hasSection(section)) {
                organized[section] = extras[section];
            }
        }
        // Include any non-standard sections
        for (const key of Object.keys(extras)) {
            if (!KNOWN_EXTRAS_SECTIONS.includes(key)) {
                organized[key] = extras[key];
            }
        }
        return formatResponse(successResult({
            document_id: input.document_id,
            file_name: doc.file_name,
            extras: organized,
            available_sections: availableSections,
            next_steps: followUps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
818
|
+
/**
 * Build a 2D grid from a parsed table's cells with computed max row/column indices.
 *
 * @param {object} table - Parsed table. Reads `cells` (each with `row`, `col`,
 *   `text`), `row_count` and `column_count`.
 * @returns {{rowMap: Map, maxRow: number, maxCol: number}} `rowMap` maps a row
 *   index to a Map of column index -> cell text. `maxRow`/`maxCol` are the
 *   last indices to iterate: the declared count minus one when that count is
 *   positive, otherwise the largest index seen in the cells (0 if none).
 */
function buildTableGrid(table) {
    const cells = table.cells ?? [];
    const rowMap = new Map();
    // Track maxima in the same pass. Spreading every cell into Math.max()
    // can exceed the engine's argument limit on very large tables.
    let observedMaxRow = 0;
    let observedMaxCol = 0;
    for (const cell of cells) {
        let rowCells = rowMap.get(cell.row);
        if (rowCells === undefined) {
            rowCells = new Map();
            rowMap.set(cell.row, rowCells);
        }
        rowCells.set(cell.col, cell.text);
        // Number() mirrors the coercion Math.max applied; a NaN index simply
        // fails the comparison instead of turning the bound into NaN.
        const rowIndex = Number(cell.row);
        const colIndex = Number(cell.col);
        if (rowIndex > observedMaxRow) {
            observedMaxRow = rowIndex;
        }
        if (colIndex > observedMaxCol) {
            observedMaxCol = colIndex;
        }
    }
    // Declared dimensions take precedence over what the cells imply.
    const maxRow = table.row_count > 0 ? table.row_count - 1 : observedMaxRow;
    const maxCol = table.column_count > 0 ? table.column_count - 1 : observedMaxCol;
    return { rowMap, maxRow, maxCol };
}
|
|
836
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
837
|
+
// INPUT SCHEMA: ocr_table_export
|
|
838
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
839
|
+
// Arguments accepted by ocr_table_export; `format` falls back to 'json'.
const TableExportInput = z.object({
    document_id: z
        .string()
        .min(1)
        .describe('Document ID to export tables from'),
    table_index: z
        .number()
        .int()
        .min(0)
        .optional()
        .describe('Specific table index (0-based). Omit to export all tables.'),
    format: z
        .enum(['csv', 'json', 'markdown'])
        .default('json')
        .describe('Export format: csv (RFC 4180), json (structured), or markdown (pipe-delimited)'),
});
|
|
846
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
847
|
+
// HANDLER: ocr_table_export
|
|
848
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
849
|
+
/**
 * Quote one CSV field: wrap it in double quotes and double any embedded quote.
 * The value is stringified first so non-string cell text does not throw.
 */
function csvQuoteField(value) {
    return `"${String(value).replace(/"/g, '""')}"`;
}
/**
 * Render one table as CSV text: every grid position is emitted, quoted,
 * comma-separated, one line per row.
 */
function exportTableAsCsv(table) {
    const { rowMap, maxRow, maxCol } = buildTableGrid(table);
    const lines = [];
    for (let r = 0; r <= maxRow; r++) {
        const row = rowMap.get(r);
        const fields = [];
        for (let c = 0; c <= maxCol; c++) {
            fields.push(csvQuoteField(row?.get(c) ?? ''));
        }
        lines.push(fields.join(','));
    }
    return lines.join('\n');
}
/**
 * Make a cell value safe inside a pipe-delimited markdown row: a literal `|`
 * would start a new column and a line break would end the row.
 */
function markdownCellText(value) {
    return String(value ?? '')
        .replace(/\|/g, '\\|')
        .replace(/\r\n|\r|\n/g, '<br>');
}
/**
 * Render one table as a pipe-delimited markdown table. Row 0 is the header;
 * a bold caption line is placed above the table when the table has a caption.
 */
function exportTableAsMarkdown(table) {
    const { rowMap, maxRow, maxCol } = buildTableGrid(table);
    const cellsOfRow = (r) => {
        const row = rowMap.get(r);
        const cells = [];
        for (let c = 0; c <= maxCol; c++) {
            cells.push(markdownCellText(row?.get(c)));
        }
        return cells;
    };
    const lines = [];
    if (table.caption) {
        lines.push(`**${table.caption}**`);
    }
    const headerCells = cellsOfRow(0);
    lines.push(`| ${headerCells.join(' | ')} |`);
    lines.push(`| ${headerCells.map(() => '---').join(' | ')} |`);
    for (let r = 1; r <= maxRow; r++) {
        lines.push(`| ${cellsOfRow(r).join(' | ')} |`);
    }
    return lines.join('\n');
}
/**
 * Convert one table to the JSON export shape: row 0 supplies the column
 * names and each later row becomes an object keyed by column name.
 *
 * Every column gets exactly one key. A missing or blank header falls back to
 * `col_<index>`, and a name already taken gets a numeric suffix, so cells
 * under blank or duplicate headers are not overwritten. `columns` lists the
 * keys in column order.
 */
function exportTableAsJson(table) {
    const { rowMap, maxRow } = buildTableGrid(table);
    const keyByColumn = new Map();
    const takenKeys = new Set();
    const assignKey = (col, headerText) => {
        const isBlank = headerText == null || String(headerText).trim() === '';
        const base = isBlank ? `col_${col}` : String(headerText);
        let key = base;
        for (let n = 2; takenKeys.has(key); n++) {
            key = `${base}_${n}`;
        }
        takenKeys.add(key);
        keyByColumn.set(col, key);
        return key;
    };
    // Build column names from first row, in column order so that the
    // de-duplication suffixes are deterministic.
    const headerRow = rowMap.get(0);
    if (headerRow) {
        const headerEntries = [...headerRow.entries()].sort(([a], [b]) => a - b);
        for (const [col, text] of headerEntries) {
            assignKey(col, text);
        }
    }
    // Build data rows as column-keyed objects
    const rows = [];
    for (let r = 1; r <= maxRow; r++) {
        const row = rowMap.get(r);
        const rowObj = {};
        if (row) {
            for (const [col, text] of row) {
                const key = keyByColumn.get(col) ?? assignKey(col, undefined);
                rowObj[key] = text;
            }
        }
        rows.push(rowObj);
    }
    const columns = [...keyByColumn.entries()]
        .sort(([a], [b]) => a - b)
        .map(([, key]) => key);
    return {
        table_index: table.table_index,
        page_number: table.page_number,
        caption: table.caption,
        columns,
        row_count: rows.length,
        rows,
    };
}
/**
 * Handle ocr_table_export - Export table data in CSV, JSON, or markdown format
 *
 * @param {object} params - Raw tool arguments, validated against
 *   TableExportInput. The handler reads `document_id`, `table_index` and
 *   `format`.
 * @returns {Promise<*>} The value produced by formatResponse() on success.
 *   Missing/unparseable JSON blocks and an out-of-range `table_index` are
 *   reported as success payloads carrying a `message`. Any error raised
 *   inside the handler is passed to handleError() and that result is returned.
 */
async function handleTableExport(params) {
    try {
        const input = validateInput(TableExportInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(input.document_id);
        if (!doc) {
            throw documentNotFoundError(input.document_id);
        }
        const nextSteps = [
            { tool: 'ocr_document_tables', description: 'View table structure and cell data' },
            { tool: 'ocr_search', description: 'Search for related content' },
        ];
        const blocksResult = fetchJsonBlocks(db.getConnection(), input.document_id);
        if (!blocksResult.ok) {
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                tables: [],
                total_tables: 0,
                format: input.format,
                message: blocksResult.reason === 'parse_error'
                    ? 'Failed to parse JSON blocks.'
                    : 'No OCR results or JSON blocks available for export.',
                next_steps: nextSteps,
            }));
        }
        const allTables = extractTablesFromBlocks(blocksResult.blocks);
        const tables = filterTablesByIndex(allTables, input.table_index);
        if (tables === null) {
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                total_tables: allTables.length,
                requested_index: input.table_index,
                format: input.format,
                message: `Table index ${input.table_index} out of range. Document has ${allTables.length} table(s).`,
                next_steps: nextSteps,
            }));
        }
        // Format output based on requested format
        if (input.format === 'csv' || input.format === 'markdown') {
            const renderTable = input.format === 'csv' ? exportTableAsCsv : exportTableAsMarkdown;
            // Tables without cells contribute no text to the export.
            const parts = tables
                .filter((table) => table.cells.length > 0)
                .map((table) => renderTable(table));
            return formatResponse(successResult({
                document_id: input.document_id,
                file_name: doc.file_name,
                total_tables: allTables.length,
                exported_tables: tables.length,
                format: input.format,
                data: parts.join('\n\n'),
                next_steps: nextSteps,
            }));
        }
        // Default: JSON format
        return formatResponse(successResult({
            document_id: input.document_id,
            file_name: doc.file_name,
            total_tables: allTables.length,
            exported_tables: tables.length,
            format: 'json',
            tables: tables.map((table) => exportTableAsJson(table)),
            next_steps: nextSteps,
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1006
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1007
|
+
// TOOL DEFINITIONS EXPORT
|
|
1008
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1009
|
+
/**
 * Intelligence tools collection for MCP server registration.
 *
 * Each tool name maps to `{ description, inputSchema, handler }`, where
 * `inputSchema` is the `.shape` of that tool's input schema object.
 */
export const intelligenceTools = Object.fromEntries([
    [
        'ocr_guide',
        GuideInput,
        handleGuide,
        '[ESSENTIAL] System state overview with prioritized next_steps. Shows databases, stats, and tool recommendations. Optional intent: explore/search/ingest/analyze/status.',
    ],
    [
        'ocr_document_tables',
        DocumentTablesInput,
        handleDocumentTables,
        '[ANALYSIS] Use to extract structured table data from a document. Returns rows, columns, and cell text for each table. Specify table_index for a specific table, or omit for all.',
    ],
    [
        'ocr_document_recommend',
        DocumentRecommendInput,
        handleDocumentRecommend,
        '[ANALYSIS] Related document recommendations via cluster membership and vector similarity. Requires embeddings and/or clustering.',
    ],
    [
        'ocr_document_extras',
        DocumentExtrasInput,
        handleDocumentExtras,
        '[ANALYSIS] Supplementary OCR data: charts, links, tracked changes, bounding boxes, infographics. Specify section to filter.',
    ],
    [
        'ocr_table_export',
        TableExportInput,
        handleTableExport,
        '[ANALYSIS] Export table data as CSV, JSON, or markdown. Specify table_index for one table, or omit for all. JSON format returns rows with column-keyed objects.',
    ],
].map(([toolName, schema, handler, description]) => [
    toolName,
    { description, inputSchema: schema.shape, handler },
]));
|
|
1039
|
+
//# sourceMappingURL=intelligence.js.map
|