ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,735 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Comparison Tools
|
|
3
|
+
*
|
|
4
|
+
* MCP tools for comparing two OCR-processed documents.
|
|
5
|
+
* Provides text diff and structural diff.
|
|
6
|
+
*
|
|
7
|
+
* CRITICAL: NEVER use console.log() - stdout is JSON-RPC protocol.
|
|
8
|
+
*
|
|
9
|
+
* @module tools/comparison
|
|
10
|
+
*/
|
|
11
|
+
import { z } from 'zod';
|
|
12
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
13
|
+
import { formatResponse, handleError, fetchProvenanceChain, } from './shared.js';
|
|
14
|
+
import { successResult } from '../server/types.js';
|
|
15
|
+
import { validateInput } from '../utils/validation.js';
|
|
16
|
+
import { requireDatabase } from '../server/state.js';
|
|
17
|
+
import { computeHash } from '../utils/hash.js';
|
|
18
|
+
import { MCPError } from '../server/errors.js';
|
|
19
|
+
import { compareText, compareStructure, generateSummary, } from '../services/comparison/diff-service.js';
|
|
20
|
+
import { insertComparison, getComparison, listComparisons, } from '../services/storage/database/comparison-operations.js';
|
|
21
|
+
import { getCluster, getClusterDocuments, } from '../services/storage/database/cluster-operations.js';
|
|
22
|
+
import { computeDocumentEmbeddings, cosineSimilarity, } from '../services/clustering/clustering-service.js';
|
|
23
|
+
import { getProvenanceTracker } from '../services/provenance/index.js';
|
|
24
|
+
import { ProvenanceType } from '../models/provenance.js';
|
|
25
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
26
|
+
// INPUT SCHEMAS
|
|
27
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
28
|
+
// Input for ocr_document_compare: two required document IDs plus output options.
const DocumentCompareInput = z.object({
    document_id_1: z
        .string()
        .min(1)
        .describe('First document ID'),
    document_id_2: z
        .string()
        .min(1)
        .describe('Second document ID'),
    include_text_diff: z
        .boolean()
        .default(true)
        .describe('Include text-level diff operations'),
    // Bounded to 1..10000, defaulting to 1000.
    max_diff_operations: z.number().int().min(1).max(10000).default(1000).describe('Maximum diff operations to return'),
    include_provenance: z.boolean().default(false).describe('Include provenance chain for the comparison'),
});

// Input for ocr_comparison_list: optional document filter plus pagination.
const ComparisonListInput = z.object({
    document_id: z.string().optional().describe('Filter by document ID (matches either doc1 or doc2)'),
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(50)
        .describe('Maximum results'),
    offset: z
        .number()
        .int()
        .min(0)
        .default(0)
        .describe('Offset for pagination'),
});

// Input for ocr_comparison_get: the ID of a stored comparison.
const ComparisonGetInput = z.object({
    comparison_id: z
        .string()
        .min(1)
        .describe('Comparison ID'),
});

// Input for comparison discovery: similarity threshold, optional document
// restriction, whether to skip already-compared pairs, and a result cap.
const ComparisonDiscoverInput = z.object({
    min_similarity: z.number().min(0).max(1).default(0.7).describe('Minimum cosine similarity threshold (0-1)'),
    document_filter: z.array(z.string()).optional().describe('Only consider these document IDs'),
    exclude_existing: z.boolean().default(true).describe('Exclude document pairs that already have comparisons'),
    limit: z.number().int().min(1).max(100).default(20).describe('Maximum pairs to return'),
});

// Input for batch comparison: explicit pairs and/or a cluster ID (both optional
// at the schema level), plus the text-diff toggle applied to each comparison.
const ComparisonBatchInput = z.object({
    pairs: z
        .array(
            z.object({
                doc1: z.string().min(1).describe('First document ID'),
                doc2: z.string().min(1).describe('Second document ID'),
            }),
        )
        .optional()
        .describe('Explicit document pairs to compare'),
    cluster_id: z.string().optional().describe('Compare all documents within this cluster'),
    include_text_diff: z.boolean().default(true).describe('Include text-level diff operations in each comparison'),
});
|
|
95
|
+
/**
 * Count the rows in the `chunks` table that belong to one document.
 *
 * @param conn - Database connection exposing `prepare(sql).get(...)`.
 * @param docId - Document ID bound to the query's single placeholder.
 * @returns The `cnt` column of the COUNT(*) row.
 */
function countChunks(conn, docId) {
    const statement = conn.prepare('SELECT COUNT(*) as cnt FROM chunks WHERE document_id = ?');
    const { cnt } = statement.get(docId);
    return cnt;
}
|
|
98
|
+
/**
 * Decode a JSON string stored on a comparison row.
 *
 * @param field - Raw stored value handed to JSON.parse.
 * @param fieldName - Column name, used only in the error message.
 * @param comparisonId - Comparison ID, used only in the error message.
 * @returns The parsed value.
 * @throws MCPError ('INTERNAL_ERROR') when JSON.parse rejects the value, so
 *   callers never receive undefined for malformed data.
 */
function parseStoredJSON(field, fieldName, comparisonId) {
    let decoded;
    try {
        decoded = JSON.parse(field);
    }
    catch (parseFailure) {
        const reason = parseFailure instanceof Error ? parseFailure.message : String(parseFailure);
        throw new MCPError('INTERNAL_ERROR', `Failed to parse ${fieldName} for comparison '${comparisonId}': stored JSON is malformed. Error: ${reason}`);
    }
    return decoded;
}
|
|
110
|
+
/**
 * Load a document row and its OCR result, insisting the document is complete.
 *
 * @param conn - Database connection exposing `prepare(sql).get(...)`.
 * @param docId - ID of the document to load.
 * @returns `{ doc, ocr }`: the `documents` row and the matching `ocr_results` row.
 * @throws MCPError 'DOCUMENT_NOT_FOUND' when no document row exists,
 *   'VALIDATION_ERROR' when its status is not 'complete', and
 *   'INTERNAL_ERROR' when no OCR result row exists.
 */
function fetchCompleteDocument(conn, docId) {
    const doc = conn.prepare('SELECT * FROM documents WHERE id = ?').get(docId);
    if (!doc) {
        throw new MCPError('DOCUMENT_NOT_FOUND', `Document '${docId}' not found`);
    }
    if (doc.status !== 'complete') {
        throw new MCPError('VALIDATION_ERROR', `Document '${docId}' has status '${String(doc.status)}', expected 'complete'. Run ocr_process_pending first.`);
    }
    // Pick the most recently completed OCR result, using the same ordering as the
    // block-type-stats lookup in this module. Without it, a document with more
    // than one ocr_results row would yield an arbitrary row here.
    const ocr = conn
        .prepare('SELECT * FROM ocr_results WHERE document_id = ? ORDER BY processing_completed_at DESC LIMIT 1')
        .get(docId);
    if (!ocr) {
        throw new MCPError('INTERNAL_ERROR', `No OCR result found for document '${docId}'. Document may need reprocessing.`);
    }
    return { doc, ocr };
}
|
|
124
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
125
|
+
// MULTI-SIGNAL SIMILARITY HELPERS
|
|
126
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
127
|
+
/**
 * Compare two documents by their document-level embeddings.
 *
 * Requests embeddings for both IDs from computeDocumentEmbeddings and passes
 * the two matching entries to cosineSimilarity. How those helpers build and
 * compare the vectors is defined outside this file (presumably chunk-embedding
 * centroids and a cosine score; confirm against the clustering service).
 *
 * @param conn - Database connection forwarded to computeDocumentEmbeddings.
 * @param docId1 - First document ID.
 * @param docId2 - Second document ID.
 * @returns Whatever cosineSimilarity returns, or null when either document has
 *   no entry in the computed embeddings.
 */
function computeEmbeddingCentroidSimilarity(conn, docId1, docId2) {
    const centroids = computeDocumentEmbeddings(conn, [docId1, docId2]);
    const entryFor = (wantedId) => centroids.find((entry) => entry.document_id === wantedId);
    const first = entryFor(docId1);
    const second = entryFor(docId2);
    if (first && second) {
        return cosineSimilarity(first.embedding, Array.from(second.embedding));
    }
    return null;
}
|
|
142
|
+
/**
 * Score how alike two documents are in their mix of block types.
 *
 * Reads each document's block-type counts via getBlockTypeStats, treats a type
 * missing from one side as a count of 0, and returns the cosine similarity of
 * the two count vectors.
 *
 * @param conn - Database connection forwarded to getBlockTypeStats.
 * @param docId1 - First document ID.
 * @param docId2 - Second document ID.
 * @returns The cosine similarity, or 0 when either document has no stats, no
 *   block types exist at all, or either vector has zero magnitude.
 */
function computeStructuralSimilarity(conn, docId1, docId2) {
    const countsA = getBlockTypeStats(conn, docId1);
    const countsB = getBlockTypeStats(conn, docId2);
    if (!countsA || !countsB) {
        return 0;
    }
    // Union of block types seen in either document.
    const blockTypes = new Set([...Object.keys(countsA), ...Object.keys(countsB)]);
    if (blockTypes.size === 0) {
        return 0;
    }
    // Accumulate dot product and squared magnitudes in a single pass.
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (const blockType of blockTypes) {
        const a = countsA[blockType] ?? 0;
        const b = countsB[blockType] ?? 0;
        dot += a * b;
        squaredA += a * a;
        squaredB += b * b;
    }
    if (squaredA === 0 || squaredB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
|
|
178
|
+
/**
 * Read `block_type_stats` out of the `extras_json` column of a document's most
 * recently completed OCR result.
 *
 * @param conn - Database connection exposing `prepare(sql).get(...)`.
 * @param docId - Document ID whose OCR result is inspected.
 * @returns The block_type -> count object, or null when there is no row, no
 *   extras_json, the JSON is malformed (logged to stderr), or
 *   `block_type_stats` is absent or not a plain (non-array) object.
 */
function getBlockTypeStats(conn, docId) {
    const row = conn
        .prepare('SELECT extras_json FROM ocr_results WHERE document_id = ? ORDER BY processing_completed_at DESC LIMIT 1')
        .get(docId);
    if (!row?.extras_json) {
        return null;
    }
    let extras;
    try {
        extras = JSON.parse(row.extras_json);
    }
    catch (error) {
        // stderr only: stdout carries the JSON-RPC protocol.
        console.error(`[comparison] Failed to parse extras_json for block_type_stats of document ${docId}: ${error instanceof Error ? error.message : String(error)}`);
        return null;
    }
    // Valid JSON can still be `null` or a primitive; optional chaining treats
    // that as "no stats" rather than raising and logging a bogus parse failure.
    const blockTypeStats = extras?.block_type_stats;
    // Arrays are objects too, but they are not a type -> count map.
    if (!blockTypeStats || typeof blockTypeStats !== 'object' || Array.isArray(blockTypeStats)) {
        return null;
    }
    return blockTypeStats;
}
|
|
200
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
201
|
+
// TOOL HANDLERS
|
|
202
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
203
|
+
/**
 * Handle ocr_document_compare: diff two OCR-processed documents, store the
 * comparison together with a provenance record, and return the result.
 *
 * Steps: validate input, load both documents (each must be 'complete'),
 * refuse to repeat a comparison whose OCR content is unchanged, compute the
 * text diff (optional) and structural diff, blend several similarity signals
 * into a composite score, create provenance, replace any older comparison rows
 * for the pair, and build the response.
 *
 * @param params - Raw tool arguments, validated against DocumentCompareInput.
 * @returns The formatted success response, or the result of handleError for
 *   any error raised along the way (this function does not rethrow).
 */
async function handleDocumentCompare(params) {
    try {
        const startTime = Date.now();
        const input = validateInput(DocumentCompareInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        if (input.document_id_1 === input.document_id_2) {
            throw new MCPError('VALIDATION_ERROR', 'Cannot compare document with itself. Provide two different document IDs.');
        }
        const { doc: doc1, ocr: ocr1 } = fetchCompleteDocument(conn, input.document_id_1);
        const { doc: doc2, ocr: ocr2 } = fetchCompleteDocument(conn, input.document_id_2);
        // Input hash identifies the OCR content being compared, in call order.
        // It is stored on the provenance record and used for duplicate detection.
        const ocrHash1 = String(ocr1.content_hash);
        const ocrHash2 = String(ocr2.content_hash);
        const inputHash = computeHash(ocrHash1 + ':' + ocrHash2);
        // Duplicate comparison detection (matches the pair in either order)
        const existingComparison = conn
            .prepare(`SELECT c.id, c.created_at, c.similarity_ratio
                FROM comparisons c
                WHERE (c.document_id_1 = ? AND c.document_id_2 = ?)
                   OR (c.document_id_1 = ? AND c.document_id_2 = ?)
                ORDER BY c.created_at DESC LIMIT 1`)
            .get(input.document_id_1, input.document_id_2, input.document_id_2, input.document_id_1);
        if (existingComparison) {
            // Check if underlying OCR data has changed since last comparison
            const prevInputHash = conn
                .prepare('SELECT input_hash FROM provenance WHERE id = (SELECT provenance_id FROM comparisons WHERE id = ?)')
                .get(existingComparison.id);
            // The earlier comparison may have been requested with the documents
            // swapped, in which case its stored hash was built in the opposite
            // order. Accept either ordering as "same content".
            const reversedInputHash = computeHash(ocrHash2 + ':' + ocrHash1);
            const unchanged = Boolean(prevInputHash) &&
                (prevInputHash.input_hash === inputHash || prevInputHash.input_hash === reversedInputHash);
            if (unchanged) {
                throw new MCPError('VALIDATION_ERROR', `These documents were already compared with identical OCR content. ` +
                    `Existing comparison: ${existingComparison.id} (created ${existingComparison.created_at}, similarity ${(existingComparison.similarity_ratio * 100).toFixed(1)}%). ` +
                    `To re-compare, first reprocess one of the documents with ocr_reprocess.`);
            }
            // If input hashes differ, the OCR content has changed, allow re-comparison
        }
        const chunks1Count = countChunks(conn, input.document_id_1);
        const chunks2Count = countChunks(conn, input.document_id_2);
        // Text diff
        const textDiff = input.include_text_diff
            ? compareText(String(ocr1.extracted_text), String(ocr2.extracted_text), input.max_diff_operations)
            : null;
        // Structural diff
        const structuralDiff = compareStructure({
            page_count: doc1.page_count,
            text_length: Number(ocr1.text_length),
            quality_score: ocr1.parse_quality_score,
            ocr_mode: String(ocr1.datalab_mode),
            chunk_count: chunks1Count,
        }, {
            page_count: doc2.page_count,
            text_length: Number(ocr2.text_length),
            quality_score: ocr2.parse_quality_score,
            ocr_mode: String(ocr2.datalab_mode),
            chunk_count: chunks2Count,
        });
        // Generate summary
        const summary = generateSummary(textDiff, structuralDiff, String(doc1.file_name), String(doc2.file_name));
        // Text similarity comes from the text diff; it is 0 when the text diff
        // was not requested.
        const similarityRatio = textDiff ? textDiff.similarity_ratio : 0;
        // Multi-signal similarity computation (ME-6)
        // Track which components failed to surface in response instead of silently swallowing
        const componentsFailed = [];
        let embeddingSimilarity = null;
        try {
            embeddingSimilarity = computeEmbeddingCentroidSimilarity(conn, input.document_id_1, input.document_id_2);
        }
        catch (error) {
            console.error('[comparison] Centroid similarity failed:', error instanceof Error ? error.message : String(error));
            componentsFailed.push('centroid_similarity');
        }
        let structSimilarity = 0;
        try {
            structSimilarity = computeStructuralSimilarity(conn, input.document_id_1, input.document_id_2);
        }
        catch (error) {
            console.error('[comparison] Structural similarity failed:', error instanceof Error ? error.message : String(error));
            componentsFailed.push('structural_similarity');
        }
        // Quality alignment: how close are the OCR quality scores
        // (relative difference; 0 when either score is missing or non-positive)
        const q1 = ocr1.parse_quality_score ?? 0;
        const q2 = ocr2.parse_quality_score ?? 0;
        const qualityAlignment = q1 > 0 && q2 > 0
            ? 1 - Math.abs(q1 - q2) / Math.max(q1, q2)
            : 0;
        // Composite similarity: weighted blend of all signals. When no embedding
        // similarity is available its weight falls back to the text similarity.
        const compositeSimilarity = 0.4 * similarityRatio +
            0.3 * (embeddingSimilarity ?? similarityRatio) +
            0.2 * structSimilarity +
            0.1 * qualityAlignment;
        // Compute content hash
        const diffContent = JSON.stringify({
            text_diff: textDiff,
            structural_diff: structuralDiff,
        });
        const contentHash = computeHash(diffContent);
        // Create provenance record
        const comparisonId = uuidv4();
        const now = new Date().toISOString();
        const tracker = getProvenanceTracker(db);
        const provId = tracker.createProvenance({
            type: ProvenanceType.COMPARISON,
            source_type: 'COMPARISON',
            source_id: String(ocr1.provenance_id),
            root_document_id: String(doc1.provenance_id),
            content_hash: contentHash,
            input_hash: inputHash,
            file_hash: String(doc1.file_hash),
            source_path: `${String(doc1.file_path)} <-> ${String(doc2.file_path)}`,
            processor: 'document-comparison',
            processor_version: '1.0.0',
            processing_params: { document_id_1: input.document_id_1, document_id_2: input.document_id_2 },
        });
        const processingDurationMs = Date.now() - startTime;
        // Update provenance with actual duration (not known at creation time)
        conn
            .prepare('UPDATE provenance SET processing_duration_ms = ? WHERE id = ?')
            .run(processingDurationMs, provId);
        // Insert comparison record
        const comparison = {
            id: comparisonId,
            document_id_1: input.document_id_1,
            document_id_2: input.document_id_2,
            similarity_ratio: similarityRatio,
            text_diff_json: JSON.stringify(textDiff ?? {}),
            structural_diff_json: JSON.stringify(structuralDiff),
            summary,
            content_hash: contentHash,
            provenance_id: provId,
            created_at: now,
            processing_duration_ms: processingDurationMs,
        };
        // F-INTEG-10: Delete stale comparisons for this document pair before inserting
        // (handles re-OCR creating new comparisons alongside outdated ones)
        // NOTE(review): the delete and the insert are separate statements; confirm
        // whether they should share a transaction.
        conn
            .prepare(`DELETE FROM comparisons WHERE
                (document_id_1 = ? AND document_id_2 = ?) OR
                (document_id_1 = ? AND document_id_2 = ?)`)
            .run(input.document_id_1, input.document_id_2, input.document_id_2, input.document_id_1);
        insertComparison(conn, comparison);
        const comparisonResponse = {
            comparison_id: comparisonId,
            document_1: { id: input.document_id_1, file_name: doc1.file_name },
            document_2: { id: input.document_id_2, file_name: doc2.file_name },
            similarity_ratio: similarityRatio,
            composite_similarity: Math.round(compositeSimilarity * 10000) / 10000,
            similarity_signals: {
                text_similarity: similarityRatio,
                embedding_centroid_similarity: embeddingSimilarity !== null
                    ? Math.round(embeddingSimilarity * 10000) / 10000
                    : null,
                structural_similarity: Math.round(structSimilarity * 10000) / 10000,
                quality_alignment: Math.round(qualityAlignment * 10000) / 10000,
                weights: { text: 0.4, embedding: 0.3, structural: 0.2, quality: 0.1 },
            },
            summary,
            text_diff: textDiff,
            structural_diff: structuralDiff,
            provenance_id: provId,
            processing_duration_ms: processingDurationMs,
        };
        if (componentsFailed.length > 0) {
            comparisonResponse.components_failed = componentsFailed;
        }
        if (input.include_provenance) {
            comparisonResponse.provenance_chain = fetchProvenanceChain(db, provId, 'comparison');
        }
        comparisonResponse.next_steps = [
            { tool: 'ocr_comparison_list', description: 'View all comparisons in the database' },
        ];
        return formatResponse(successResult(comparisonResponse));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
376
|
+
/**
 * Handle ocr_comparison_list: return summaries of stored comparisons.
 *
 * The validated input is passed unchanged to listComparisons; each row is
 * trimmed to its summary fields so the large diff JSON columns are left out.
 *
 * @param params - Raw tool arguments, validated against ComparisonListInput.
 * @returns The formatted success response, or the result of handleError.
 */
async function handleComparisonList(params) {
    try {
        const input = validateInput(ComparisonListInput, params);
        const conn = requireDatabase().db.getConnection();
        const rows = listComparisons(conn, input);
        // Keep only the lightweight columns for each comparison.
        const summaries = rows.map((row) => {
            const { id, document_id_1, document_id_2, similarity_ratio, summary, created_at, processing_duration_ms } = row;
            return { id, document_id_1, document_id_2, similarity_ratio, summary, created_at, processing_duration_ms };
        });
        const payload = {
            comparisons: summaries,
            count: summaries.length,
            offset: input.offset,
            limit: input.limit,
            next_steps: [{ tool: 'ocr_comparison_get', description: 'View full diff data for a comparison' }, { tool: 'ocr_document_compare', description: 'Compare two new documents' }],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
404
|
+
/**
 * Handle ocr_comparison_get: return one stored comparison with its diff
 * columns decoded from JSON.
 *
 * @param params - Raw tool arguments, validated against ComparisonGetInput.
 * @returns The formatted success response, or the result of handleError when
 *   the comparison is missing ('DOCUMENT_NOT_FOUND') or a stored JSON column
 *   cannot be parsed.
 */
async function handleComparisonGet(params) {
    try {
        const { comparison_id: comparisonId } = validateInput(ComparisonGetInput, params);
        const conn = requireDatabase().db.getConnection();
        const stored = getComparison(conn, comparisonId);
        if (!stored) {
            throw new MCPError('DOCUMENT_NOT_FOUND', `Comparison '${comparisonId}' not found`);
        }
        // Decode the stored JSON columns; parseStoredJSON raises on malformed data.
        const textDiff = parseStoredJSON(stored.text_diff_json, 'text_diff_json', comparisonId);
        const structuralDiff = parseStoredJSON(stored.structural_diff_json, 'structural_diff_json', comparisonId);
        const payload = {
            ...stored,
            text_diff_json: textDiff,
            structural_diff_json: structuralDiff,
            next_steps: [{ tool: 'ocr_document_get', description: 'View one of the compared documents' }, { tool: 'ocr_comparison_list', description: 'Browse other comparisons' }],
        };
        return formatResponse(successResult(payload));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
425
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
426
|
+
// DISCOVER & BATCH HANDLERS
|
|
427
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
428
|
+
/**
 * Suggest document pairs worth comparing, ranked by embedding similarity.
 *
 * Obtains one embedding per document from computeDocumentEmbeddings, scores
 * every unordered pair with cosineSimilarity, keeps pairs at or above the
 * threshold, optionally drops pairs that already have a row in `comparisons`,
 * and returns the highest-scoring pairs up to the limit.
 *
 * @param params - Raw tool arguments, validated against ComparisonDiscoverInput.
 * @returns The formatted success response (an empty pair list with a message
 *   when fewer than 2 documents have embeddings), or the result of handleError.
 */
async function handleComparisonDiscover(params) {
    try {
        const input = validateInput(ComparisonDiscoverInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const threshold = input.min_similarity ?? 0.7;
        const skipCompared = input.exclude_existing ?? true;
        const maxPairs = input.limit ?? 20;
        // One embedding entry per document
        const centroids = computeDocumentEmbeddings(conn, input.document_filter);
        const analyzed = centroids.length;
        if (analyzed < 2) {
            const message = analyzed === 0
                ? 'No documents with embeddings found'
                : 'At least 2 documents with embeddings required for comparison discovery';
            return formatResponse(successResult({
                pairs: [],
                total_pairs: 0,
                documents_analyzed: analyzed,
                message,
                next_steps: [{ tool: 'ocr_process_pending', description: 'Process more documents to enable comparison' }],
            }));
        }
        // Keys of pairs that already have a comparison, stored in both orders so
        // a single lookup suffices regardless of which document comes first.
        const keyOf = (leftId, rightId) => `${leftId}:${rightId}`;
        const alreadyCompared = new Set();
        if (skipCompared) {
            const rows = conn
                .prepare('SELECT document_id_1, document_id_2 FROM comparisons')
                .all();
            rows.forEach((row) => {
                alreadyCompared.add(keyOf(row.document_id_1, row.document_id_2));
                alreadyCompared.add(keyOf(row.document_id_2, row.document_id_1));
            });
        }
        // File name per document, 'unknown' when the document or name is missing
        const fileNames = new Map();
        centroids.forEach(({ document_id: id }) => {
            fileNames.set(id, db.getDocument(id)?.file_name ?? 'unknown');
        });
        // Score every unordered pair
        const pairs = [];
        centroids.forEach((left, index) => {
            for (const right of centroids.slice(index + 1)) {
                if (skipCompared && alreadyCompared.has(keyOf(left.document_id, right.document_id))) {
                    continue;
                }
                const score = cosineSimilarity(left.embedding, Array.from(right.embedding));
                if (score < threshold) {
                    continue;
                }
                pairs.push({
                    document_id_1: left.document_id,
                    document_id_2: right.document_id,
                    similarity: Math.round(score * 10000) / 10000,
                    file_name_1: fileNames.get(left.document_id) ?? 'unknown',
                    file_name_2: fileNames.get(right.document_id) ?? 'unknown',
                });
            }
        });
        // Highest similarity first, then cap the result
        pairs.sort((a, b) => b.similarity - a.similarity);
        const topPairs = pairs.slice(0, maxPairs);
        return formatResponse(successResult({
            pairs: topPairs,
            total_pairs: pairs.length,
            returned_pairs: topPairs.length,
            documents_analyzed: analyzed,
            min_similarity: threshold,
            exclude_existing: skipCompared,
            next_steps: [{ tool: 'ocr_document_compare', description: 'Compare a discovered similar pair' }, { tool: 'ocr_comparison_batch', description: 'Compare all discovered pairs at once' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
511
|
+
/**
 * Compare several document pairs in one call.
 *
 * Pairs come from `cluster_id` (every unordered pair of the cluster's
 * documents) or from the explicit `pairs` list; when both are supplied,
 * `cluster_id` takes precedence and `pairs` is ignored. Each pair is run
 * through handleDocumentCompare with a reduced diff limit, and per-pair
 * failures are collected rather than aborting the batch.
 *
 * @param params - Raw tool arguments, validated against ComparisonBatchInput.
 * @returns The formatted success response listing results and any per-pair
 *   errors, or the result of handleError when the cluster is missing, no pair
 *   source is given, or every comparison failed.
 */
async function handleComparisonBatch(params) {
    try {
        const input = validateInput(ComparisonBatchInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Build list of pairs to compare. Every path below either returns,
        // throws, or leaves at least one pair in the list.
        let pairsToCompare = [];
        if (input.cluster_id) {
            // Get all documents in cluster and generate all pairs
            const cluster = getCluster(conn, input.cluster_id);
            if (!cluster) {
                throw new MCPError('DOCUMENT_NOT_FOUND', `Cluster "${input.cluster_id}" not found`);
            }
            const members = getClusterDocuments(conn, input.cluster_id);
            if (members.length < 2) {
                return formatResponse(successResult({
                    results: [],
                    total_compared: 0,
                    message: `Cluster has ${members.length} document(s), need at least 2 for comparison`,
                    next_steps: [{ tool: 'ocr_cluster_list', description: 'Find a cluster with more documents' }],
                }));
            }
            for (let i = 0; i < members.length; i++) {
                for (let j = i + 1; j < members.length; j++) {
                    pairsToCompare.push({
                        doc1: members[i].document_id,
                        doc2: members[j].document_id,
                    });
                }
            }
        }
        else if (input.pairs && input.pairs.length > 0) {
            pairsToCompare = input.pairs;
        }
        else {
            throw new MCPError('VALIDATION_ERROR', 'Either pairs or cluster_id must be provided');
        }
        // Compare each pair by calling the existing compare handler
        const results = [];
        const errors = [];
        for (const pair of pairsToCompare) {
            try {
                const compareResult = await handleDocumentCompare({
                    document_id_1: pair.doc1,
                    document_id_2: pair.doc2,
                    include_text_diff: input.include_text_diff ?? true,
                    max_diff_operations: 100, // Use smaller limit for batch
                    include_provenance: false,
                });
                // The compare handler returns a formatted response; its JSON text
                // carries the success flag and data (or the error).
                const parsed = JSON.parse(compareResult.content[0].text);
                if (parsed.success && parsed.data) {
                    results.push({
                        document_id_1: pair.doc1,
                        document_id_2: pair.doc2,
                        comparison_id: parsed.data.comparison_id,
                        similarity_ratio: parsed.data.similarity_ratio,
                        summary: parsed.data.summary,
                    });
                }
                else {
                    errors.push({
                        doc1: pair.doc1,
                        doc2: pair.doc2,
                        error: parsed.error?.message ?? 'Unknown error',
                    });
                }
            }
            catch (e) {
                errors.push({
                    doc1: pair.doc1,
                    doc2: pair.doc2,
                    error: e instanceof Error ? e.message : String(e),
                });
            }
        }
        // M-4: If every comparison failed, throw an error instead of returning success
        if (results.length === 0 && errors.length > 0) {
            const errorDetails = errors
                .map((e) => ` ${e.doc1} <-> ${e.doc2}: ${e.error}`)
                .join('\n');
            throw new MCPError('INTERNAL_ERROR', `All ${errors.length} comparison(s) failed:\n${errorDetails}`);
        }
        return formatResponse(successResult({
            results,
            errors: errors.length > 0 ? errors : undefined,
            total_compared: results.length,
            total_errors: errors.length,
            total_pairs_requested: pairsToCompare.length,
            next_steps: [{ tool: 'ocr_comparison_list', description: 'List all comparison results' }, { tool: 'ocr_comparison_get', description: 'View details for a specific comparison' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
618
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
619
|
+
// COMPARISON MATRIX HANDLER
|
|
620
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
621
|
+
// Input for the similarity-matrix tool: an optional explicit document list and
// a cap (2..100, default 50) on how many documents enter the matrix.
const ComparisonMatrixInput = z.object({
    document_ids: z
        .array(z.string())
        .optional()
        .describe('Document IDs to include (default: all documents with embeddings)'),
    max_documents: z
        .number()
        .int()
        .min(2)
        .max(100)
        .default(50)
        .describe('Maximum documents in matrix'),
});
|
|
627
|
+
/**
 * Handle ocr_comparison_matrix - Compute pairwise similarity matrix for documents
 *
 * Obtains one embedding per document from computeDocumentEmbeddings, keeps at
 * most `max_documents` of them, and fills an N x N matrix of cosine
 * similarities rounded to 4 decimals (diagonal fixed at 1). Also reports the
 * most and least similar pairs and the average over all unordered pairs.
 *
 * Each unordered pair is scored once and mirrored across the diagonal; this
 * relies on cosineSimilarity being symmetric in its two arguments
 * (NOTE(review): true of cosine similarity by definition; confirm the imported
 * implementation does not treat its arguments differently).
 *
 * @param params - Raw tool arguments, validated against ComparisonMatrixInput.
 * @returns The formatted success response, or the result of handleError
 *   (including 'VALIDATION_ERROR' when fewer than 2 documents have embeddings).
 */
async function handleComparisonMatrix(params) {
    try {
        const input = validateInput(ComparisonMatrixInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Compute document centroid embeddings
        const docEmbeddings = computeDocumentEmbeddings(conn, input.document_ids);
        if (docEmbeddings.length < 2) {
            throw new MCPError('VALIDATION_ERROR', `Need at least 2 documents with embeddings for a similarity matrix. Found: ${docEmbeddings.length}`);
        }
        // Limit to max_documents (default 50 from schema)
        const limited = docEmbeddings.slice(0, input.max_documents);
        // Get file names for all documents
        const documentIds = [];
        const fileNames = [];
        for (const de of limited) {
            documentIds.push(de.document_id);
            const doc = db.getDocument(de.document_id);
            fileNames.push(doc?.file_name ?? 'unknown');
        }
        // Compute NxN similarity matrix, starting from all ones so the diagonal
        // is already in place; off-diagonal cells are overwritten below.
        const n = limited.length;
        const matrix = Array.from({ length: n }, () => Array.from({ length: n }, () => 1.0));
        let mostSimilarPair = { doc1_index: 0, doc2_index: 1, similarity: -1 };
        let leastSimilarPair = { doc1_index: 0, doc2_index: 1, similarity: 2 };
        let totalSimilarity = 0;
        let pairCount = 0;
        for (let i = 0; i < n; i++) {
            // Upper triangle only: one similarity call per unordered pair.
            for (let j = i + 1; j < n; j++) {
                const sim = cosineSimilarity(limited[i].embedding, Array.from(limited[j].embedding));
                const rounded = Math.round(sim * 10000) / 10000;
                matrix[i][j] = rounded;
                matrix[j][i] = rounded;
                totalSimilarity += rounded;
                pairCount++;
                if (rounded > mostSimilarPair.similarity) {
                    mostSimilarPair = { doc1_index: i, doc2_index: j, similarity: rounded };
                }
                if (rounded < leastSimilarPair.similarity) {
                    leastSimilarPair = { doc1_index: i, doc2_index: j, similarity: rounded };
                }
            }
        }
        const averageSimilarity = pairCount > 0
            ? Math.round((totalSimilarity / pairCount) * 10000) / 10000
            : 0;
        return formatResponse(successResult({
            document_ids: documentIds,
            file_names: fileNames,
            matrix,
            most_similar_pair: mostSimilarPair,
            least_similar_pair: leastSimilarPair,
            average_similarity: averageSimilarity,
            documents_analyzed: n,
            next_steps: [{ tool: 'ocr_document_compare', description: 'Compare the most similar pair in detail' }, { tool: 'ocr_cluster_documents', description: 'Cluster documents by similarity' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
700
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
701
|
+
// TOOL EXPORTS
|
|
702
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
703
|
+
/**
 * Registry of the comparison tools exposed by this module.
 *
 * Each entry maps a tool name to its human-readable description, the shape of
 * its input schema, and the handler function that serves it. The table below
 * lists one [name, description, schema, handler] tuple per tool; insertion
 * order of the resulting object follows the table order.
 */
export const comparisonTools = Object.fromEntries([
    [
        'ocr_document_compare',
        '[ANALYSIS] Diff two documents for text and structural differences. Returns similarity ratios and diffs. Both must have status "complete".',
        DocumentCompareInput,
        handleDocumentCompare,
    ],
    [
        'ocr_comparison_list',
        '[ANALYSIS] Use to list past document comparisons with optional filtering by document ID. Returns comparison summaries with similarity ratios. Use ocr_comparison_get for full diff data.',
        ComparisonListInput,
        handleComparisonList,
    ],
    [
        'ocr_comparison_get',
        '[ANALYSIS] Use to retrieve full diff data for a specific comparison by ID. Returns text diff operations and structural differences. Use after ocr_comparison_list.',
        ComparisonGetInput,
        handleComparisonGet,
    ],
    [
        'ocr_comparison_discover',
        '[ANALYSIS] Find likely-similar document pairs ranked by embedding similarity. Follow with ocr_document_compare or ocr_comparison_batch.',
        ComparisonDiscoverInput,
        handleComparisonDiscover,
    ],
    [
        'ocr_comparison_batch',
        '[ANALYSIS] Compare multiple document pairs at once. Provide explicit pairs or a cluster_id to compare all within a cluster.',
        ComparisonBatchInput,
        handleComparisonBatch,
    ],
    [
        'ocr_comparison_matrix',
        '[ANALYSIS] NxN pairwise cosine similarity matrix across documents. Returns most/least similar pairs and averages. Requires embeddings.',
        ComparisonMatrixInput,
        handleComparisonMatrix,
    ],
].map(([toolName, description, schema, handler]) => [
    toolName,
    { description, inputSchema: schema.shape, handler },
]));
|
|
735
|
+
//# sourceMappingURL=comparison.js.map
|