ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,2528 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Tools: ocr_search (unified: keyword/semantic/hybrid), ocr_fts_manage,
|
|
5
|
+
* ocr_search_export, ocr_benchmark_compare, ocr_rag_context,
|
|
6
|
+
* ocr_search_saved (unified: save/list/get/execute)
|
|
7
|
+
*
|
|
8
|
+
* CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
|
|
9
|
+
* Use console.error() for all logging.
|
|
10
|
+
*
|
|
11
|
+
* @module tools/search
|
|
12
|
+
*/
|
|
13
|
+
import * as fs from 'fs';
|
|
14
|
+
import * as path from 'path';
|
|
15
|
+
import { safeMin, safeMax } from '../utils/math.js';
|
|
16
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
17
|
+
import { z } from 'zod';
|
|
18
|
+
import { getEmbeddingService } from '../services/embedding/embedder.js';
|
|
19
|
+
import { DatabaseService } from '../services/storage/database/index.js';
|
|
20
|
+
import { VectorService } from '../services/storage/vector.js';
|
|
21
|
+
import { requireDatabase, getDefaultStoragePath, withDatabaseOperation } from '../server/state.js';
|
|
22
|
+
import { successResult } from '../server/types.js';
|
|
23
|
+
import { validateInput, sanitizePath, escapeLikePattern, SearchUnifiedInput, FTSManageInput, } from '../utils/validation.js';
|
|
24
|
+
import { MCPError } from '../server/errors.js';
|
|
25
|
+
import { formatResponse, handleError } from './shared.js';
|
|
26
|
+
import { BM25SearchService, sanitizeFTS5Query } from '../services/search/bm25.js';
|
|
27
|
+
import { RRFFusion } from '../services/search/fusion.js';
|
|
28
|
+
import { rerankResults } from '../services/search/reranker.js';
|
|
29
|
+
import { expandQuery, getExpandedTerms } from '../services/search/query-expander.js';
|
|
30
|
+
import { classifyQuery, isTableQuery } from '../services/search/query-classifier.js';
|
|
31
|
+
import { getClusterSummariesForDocument } from '../services/storage/database/cluster-operations.js';
|
|
32
|
+
import { getImage } from '../services/storage/database/image-operations.js';
|
|
33
|
+
import { computeBlockConfidence, isRepeatedHeaderFooter } from '../services/chunking/json-block-analyzer.js';
|
|
34
|
+
/**
 * Bucket a flat list of search results by the document they came from.
 *
 * The document id is read from `document_id`, falling back to
 * `source_document_id`; results that carry neither are dropped. The
 * document-level metadata of each bucket is copied from the first result
 * seen for that document.
 *
 * @param {Array<object>} results - Flat search results.
 * @returns {{grouped: Array<object>, total_documents: number}} Buckets ordered
 *   by `result_count` (largest first) and the number of distinct documents.
 */
function groupResultsByDocument(results) {
    const byDocument = new Map();
    results.forEach((hit) => {
        const key = hit.document_id ?? hit.source_document_id;
        if (!key) {
            return;
        }
        let bucket = byDocument.get(key);
        if (bucket === undefined) {
            bucket = {
                document_id: key,
                file_name: hit.source_file_name ?? '',
                file_path: hit.source_file_path ?? '',
                doc_title: hit.doc_title ?? null,
                doc_author: hit.doc_author ?? null,
                total_pages: hit.doc_page_count ?? null,
                total_chunks: hit.total_chunks ?? 0,
                ocr_quality_score: hit.ocr_quality_score ?? null,
                result_count: 0,
                results: [],
            };
            byDocument.set(key, bucket);
        }
        bucket.result_count += 1;
        bucket.results.push(hit);
    });
    const grouped = [...byDocument.values()];
    grouped.sort((left, right) => right.result_count - left.result_count);
    return { grouped, total_documents: byDocument.size };
}
|
|
68
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
69
|
+
// METADATA FILTER RESOLVER
|
|
70
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
71
|
+
/**
 * Turn a metadata_filter (doc_title / doc_author / doc_subject) into a list
 * of matching document IDs.
 *
 * - No filter object, or none of the three fields set: `existingDocFilter`
 *   is handed back untouched.
 * - Otherwise each provided field becomes a substring LIKE match (wildcards
 *   escaped via escapeLikePattern) and, when `existingDocFilter` is a
 *   non-empty array, the match is restricted to those IDs.
 * - When nothing matches, the sentinel `['__no_match__']` is returned instead
 *   of an empty array so that later stages do not read it as "no filter".
 *
 * @param db - Database service exposing `getConnection()`.
 * @param metadataFilter - Optional object with doc_title/doc_author/doc_subject.
 * @param existingDocFilter - Optional array of document IDs already in effect.
 * @returns Document ID array, the sentinel array, or `existingDocFilter`.
 */
function resolveMetadataFilter(db, metadataFilter, existingDocFilter) {
    if (!metadataFilter) {
        return existingDocFilter;
    }
    const { doc_title, doc_author, doc_subject } = metadataFilter;
    if (!(doc_title || doc_author || doc_subject)) {
        return existingDocFilter;
    }
    const sqlParts = ['SELECT id FROM documents WHERE 1=1'];
    const bindings = [];
    const fieldNeedles = [
        ['doc_title', doc_title],
        ['doc_author', doc_author],
        ['doc_subject', doc_subject],
    ];
    for (const [column, needle] of fieldNeedles) {
        if (needle) {
            sqlParts.push(` AND ${column} LIKE ? ESCAPE '\\'`);
            bindings.push(`%${escapeLikePattern(needle)}%`);
        }
    }
    // Intersect with the document filter that is already in effect, if any.
    if (existingDocFilter && existingDocFilter.length > 0) {
        const marks = existingDocFilter.map(() => '?').join(',');
        sqlParts.push(` AND id IN (${marks})`);
        bindings.push(...existingDocFilter);
    }
    const matchedIds = db
        .getConnection()
        .prepare(sqlParts.join(''))
        .all(...bindings)
        .map((row) => row.id);
    // Sentinel rather than []: an empty list would be read downstream as "unfiltered".
    return matchedIds.length === 0 ? ['__no_match__'] : matchedIds;
}
|
|
114
|
+
/**
 * Resolve min_quality_score to a list of document IDs.
 *
 * - Threshold missing (`undefined`/`null`) or `0`: `existingDocFilter` is
 *   returned unchanged.
 * - Otherwise documents having an OCR result with a non-null
 *   `parse_quality_score >= minQualityScore` are selected and, when
 *   `existingDocFilter` is a non-empty array, intersected with it.
 * - When nothing survives, the sentinel `['__no_match__']` is returned so
 *   that later stages do not interpret an empty list as "no filter".
 *
 * An empty `existingDocFilter` array is treated as "no filter", matching
 * resolveMetadataFilter and resolveClusterFilter in this file.
 *
 * @param db - Database service exposing `getConnection()`.
 * @param minQualityScore - Minimum quality score, or undefined/null/0 to skip.
 * @param existingDocFilter - Optional array of document IDs already in effect.
 * @returns Document ID array, the sentinel array, or `existingDocFilter`.
 */
function resolveQualityFilter(db, minQualityScore, existingDocFilter) {
    // `== null` covers both undefined and null; comparing against NULL in SQL
    // would otherwise match nothing and silently block every result.
    if (minQualityScore == null || minQualityScore === 0) {
        return existingDocFilter;
    }
    const rows = db
        .getConnection()
        .prepare(`SELECT DISTINCT d.id FROM documents d
         JOIN ocr_results o ON o.document_id = d.id
         WHERE o.parse_quality_score IS NOT NULL AND o.parse_quality_score >= ?`)
        .all(minQualityScore);
    const qualityIds = new Set(rows.map((row) => row.id));
    const hasExistingFilter = Array.isArray(existingDocFilter) && existingDocFilter.length > 0;
    const passing = hasExistingFilter
        ? existingDocFilter.filter((id) => qualityIds.has(id))
        : [...qualityIds];
    // Sentinel non-matchable ID keeps the downstream IN() filter restrictive.
    return passing.length === 0 ? ['__no_match__'] : passing;
}
|
|
141
|
+
/**
 * Reduce a provenance chain to a compact summary array.
 *
 * Fetches the chain through `db.getProvenanceChain(provenanceId)` and keeps
 * only id, type, chain_depth, processor and content_hash of every link.
 *
 * @param db - Database service exposing `getProvenanceChain()`.
 * @param provenanceId - ID passed through to `getProvenanceChain`.
 * @returns {Array<object>} One summary object per chain link, in chain order.
 */
function formatProvenanceChain(db, provenanceId) {
    const links = db.getProvenanceChain(provenanceId);
    return links.map((link) => {
        const { id, type, chain_depth, processor, content_hash } = link;
        return { id, type, chain_depth, processor, content_hash };
    });
}
|
|
154
|
+
/**
 * Resolve a cluster_id filter to the document IDs belonging to that cluster.
 *
 * - No cluster id: `existingDocFilter` is returned unchanged.
 * - Cluster without members: the sentinel `['__no_match__']`.
 * - With a non-empty `existingDocFilter`: the members that also appear in it
 *   (in the order of `existingDocFilter`), or the sentinel when none do.
 * - Otherwise: all member document IDs.
 *
 * @param conn - Database connection exposing `prepare(sql).all(...)`.
 * @param clusterId - Cluster to look up in `document_clusters`.
 * @param existingDocFilter - Optional array of document IDs already in effect.
 */
function resolveClusterFilter(conn, clusterId, existingDocFilter) {
    if (!clusterId) {
        return existingDocFilter;
    }
    const memberIds = conn
        .prepare('SELECT document_id FROM document_clusters WHERE cluster_id = ?')
        .all(clusterId)
        .map((row) => row.document_id);
    if (memberIds.length === 0) {
        return ['__no_match__'];
    }
    const hasExistingFilter = existingDocFilter && existingDocFilter.length > 0;
    if (!hasExistingFilter) {
        return memberIds;
    }
    const memberSet = new Set(memberIds);
    const overlap = existingDocFilter.filter((id) => memberSet.has(id));
    return overlap.length > 0 ? overlap : ['__no_match__'];
}
|
|
175
|
+
/**
 * Translate chunk-level filters into SQL WHERE fragments plus bind values.
 *
 * Every fragment refers to the chunks table through the alias `c`; a caller
 * whose query uses a different alias has to translate the fragments itself.
 * Fragments and parameters are emitted in a fixed order so that the n-th
 * placeholder always lines up with the n-th entry of `params`.
 *
 * All user-supplied text that ends up inside a LIKE pattern is passed through
 * escapeLikePattern and matched with `ESCAPE '\'`, so `%` and `_` in the input
 * are matched literally (e.g. a column name such as `total_amount`).
 *
 * @param {object} filters - Optional keys: content_type_filter (string[]),
 *   section_path_filter, heading_filter, page_range_filter ({min_page,max_page}),
 *   is_atomic_filter (boolean), heading_level_filter ({min_level,max_level}),
 *   min_page_count, max_page_count, table_columns_contain.
 * @returns {{conditions: string[], params: Array}} SQL fragments (to be joined
 *   with AND by the caller) and their positional bind values.
 */
function resolveChunkFilter(filters) {
    const conditions = [];
    const params = [];
    // Adds an optional lower/upper bound pair on one SQL expression.
    const addRange = (expression, lower, upper) => {
        if (lower !== undefined) {
            conditions.push(`${expression} >= ?`);
            params.push(lower);
        }
        if (upper !== undefined) {
            conditions.push(`${expression} <= ?`);
            params.push(upper);
        }
    };
    if (filters.content_type_filter && filters.content_type_filter.length > 0) {
        // content_types is a JSON array like '["table","text"]';
        // match if ANY of the requested types appears as a quoted element.
        const typeConditions = filters.content_type_filter.map(() => "c.content_types LIKE '%' || ? || '%' ESCAPE '\\'");
        conditions.push(`(${typeConditions.join(' OR ')})`);
        params.push(...filters.content_type_filter.map((t) => `"${escapeLikePattern(t)}"`));
    }
    if (filters.section_path_filter) {
        // Prefix match on the section path.
        conditions.push("c.section_path LIKE ? || '%' ESCAPE '\\'");
        params.push(escapeLikePattern(filters.section_path_filter));
    }
    if (filters.heading_filter) {
        conditions.push("c.heading_context LIKE '%' || ? || '%' ESCAPE '\\'");
        params.push(escapeLikePattern(filters.heading_filter));
    }
    if (filters.page_range_filter) {
        addRange('c.page_number', filters.page_range_filter.min_page, filters.page_range_filter.max_page);
    }
    if (filters.is_atomic_filter !== undefined) {
        conditions.push('c.is_atomic = ?');
        params.push(filters.is_atomic_filter ? 1 : 0);
    }
    if (filters.heading_level_filter) {
        addRange('c.heading_level', filters.heading_level_filter.min_level, filters.heading_level_filter.max_level);
    }
    // Document-level page count is looked up per chunk through a correlated subquery.
    addRange('(SELECT page_count FROM documents WHERE id = c.document_id)', filters.min_page_count, filters.max_page_count);
    if (filters.table_columns_contain) {
        // Restrict to atomic chunks whose provenance processing_params mention the
        // requested column text (case-insensitive, wildcards matched literally).
        conditions.push('c.is_atomic = 1');
        conditions.push("EXISTS (SELECT 1 FROM provenance p WHERE p.id = c.provenance_id AND LOWER(p.processing_params) LIKE '%' || LOWER(?) || '%' ESCAPE '\\')");
        params.push(escapeLikePattern(filters.table_columns_contain));
    }
    return { conditions, params };
}
|
|
238
|
+
/**
 * Attach neighboring chunk context to search results (mutates `results`).
 *
 * Every result receives `context_before` and `context_after` arrays holding up
 * to `contextSize` chunks on each side of its own `chunk_index` within the same
 * document. Neighbors that are themselves primary results are skipped, and
 * neighbor text is truncated to 500 characters.
 *
 * Results without a `document_id`, or whose `chunk_index` is null/undefined
 * (for example results that are not backed by a chunk), get empty context
 * arrays. A null index must not be coerced to 0, otherwise chunks 1..N of the
 * document would be reported as its "after" context.
 *
 * @param conn - Database connection exposing `prepare(sql).all(...)`.
 * @param {Array<object>} results - Search results to enrich in place.
 * @param {number} contextSize - Neighbors to fetch per side; <= 0 disables.
 * @returns {void}
 */
function attachContextChunks(conn, results, contextSize) {
    if (contextSize <= 0 || results.length === 0) {
        return;
    }
    // Chunk IDs of the primary results, used to avoid echoing a hit as context.
    const primaryChunkIds = new Set(results.map((r) => r.chunk_id).filter(Boolean));
    // Group results by document so each document needs a single range query.
    const byDoc = new Map();
    for (const r of results) {
        if (!r.document_id || r.chunk_index == null) {
            r.context_before = [];
            r.context_after = [];
            continue;
        }
        if (!byDoc.has(r.document_id)) {
            byDoc.set(r.document_id, []);
        }
        byDoc.get(r.document_id).push(r);
    }
    if (byDoc.size === 0) {
        return;
    }
    // Prepared once and reused for every document.
    const neighborStmt = conn.prepare(`SELECT id, text, chunk_index, page_number, heading_context, section_path, content_types
         FROM chunks
         WHERE document_id = ? AND chunk_index BETWEEN ? AND ?
         ORDER BY chunk_index`);
    // Collects the non-primary neighbors with index in [from, to], in ascending order.
    const collect = (neighborMap, from, to) => {
        const picked = [];
        for (let i = from; i <= to; i++) {
            const n = neighborMap.get(i);
            if (n && !primaryChunkIds.has(n.id)) {
                picked.push({
                    chunk_id: n.id,
                    chunk_index: n.chunk_index,
                    text: (n.text ?? '').substring(0, 500),
                    page_number: n.page_number,
                    heading_context: n.heading_context,
                    is_context: true,
                });
            }
        }
        return picked;
    };
    for (const [docId, docResults] of byDoc) {
        // One query covering the widest window needed by any hit of this document.
        const allIndices = docResults.map((r) => r.chunk_index);
        const minIdx = (safeMin(allIndices) ?? 0) - contextSize;
        const maxIdx = (safeMax(allIndices) ?? 0) + contextSize;
        const neighbors = neighborStmt.all(docId, minIdx, maxIdx);
        const neighborMap = new Map(neighbors.map((n) => [n.chunk_index, n]));
        for (const r of docResults) {
            const idx = r.chunk_index;
            r.context_before = collect(neighborMap, idx - contextSize, idx - 1);
            r.context_after = collect(neighborMap, idx + 1, idx + contextSize);
        }
    }
}
|
|
307
|
+
/**
 * Decorate table results with table metadata (mutates `results`).
 *
 * A result counts as a table result when it has a `chunk_id` and its
 * `content_types` string contains `"table"`. For those chunks the provenance
 * `processing_params` JSON is loaded in one batched query; when it contains
 * `table_columns`, the result receives `table_columns`, `table_row_count` and
 * `table_column_count` (the two counts default to 0) as top-level fields.
 * Rows whose JSON cannot be processed are logged to stderr and skipped.
 *
 * @param conn - Database connection exposing `prepare(sql).all(...)`.
 * @param {Array<object>} results - Search results to enrich in place.
 * @returns {void}
 */
function attachTableMetadata(conn, results) {
    const isTableHit = (hit) => hit.chunk_id && typeof hit.content_types === 'string' && hit.content_types.includes('"table"');
    const tableChunkIds = results.filter(isTableHit).map((hit) => hit.chunk_id);
    if (tableChunkIds.length === 0) {
        return;
    }
    // chunks.provenance_id -> provenance.id, one query for all table chunks.
    const marks = tableChunkIds.map(() => '?').join(',');
    const rows = conn.prepare(`SELECT c.id AS chunk_id, p.processing_params
         FROM chunks c
         INNER JOIN provenance p ON c.provenance_id = p.id
         WHERE c.id IN (${marks})`).all(...tableChunkIds);
    // First usable row per chunk wins.
    const metadataByChunk = new Map();
    for (const { chunk_id, processing_params } of rows) {
        if (metadataByChunk.has(chunk_id)) {
            continue;
        }
        try {
            const parsed = JSON.parse(processing_params);
            if (!parsed.table_columns) {
                continue;
            }
            metadataByChunk.set(chunk_id, {
                table_columns: parsed.table_columns,
                table_row_count: parsed.table_row_count ?? 0,
                table_column_count: parsed.table_column_count ?? 0,
            });
        }
        catch (error) {
            console.error(`[search] Failed to parse processing_params for chunk ${chunk_id}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    for (const hit of results) {
        if (!hit.chunk_id) {
            continue;
        }
        const meta = metadataByChunk.get(hit.chunk_id);
        if (!meta) {
            continue;
        }
        const { table_columns, table_row_count, table_column_count } = meta;
        hit.table_columns = table_columns;
        hit.table_row_count = table_row_count;
        hit.table_column_count = table_column_count;
    }
}
|
|
358
|
+
/**
 * Drop results whose chunk carries the `system:repeated_header_footer` tag (T2.8).
 *
 * Looks up every chunk entity tagged that way via entity_tags/tags. When no
 * chunk is tagged, the input array itself is returned; otherwise a new array
 * without the tagged chunks is returned. Results lacking a `chunk_id` are
 * always kept.
 *
 * @param conn - Database connection exposing `prepare(sql).all()`.
 * @param {Array<object>} results - Search results to filter.
 * @returns {Array<object>} The surviving results.
 */
function excludeRepeatedHeaderFooterChunks(conn, results) {
    const taggedRows = conn.prepare(`SELECT et.entity_id FROM entity_tags et
       JOIN tags t ON t.id = et.tag_id
       WHERE t.name = 'system:repeated_header_footer' AND et.entity_type = 'chunk'`).all();
    if (taggedRows.length === 0) {
        return results;
    }
    const blockedChunkIds = new Set();
    for (const row of taggedRows) {
        blockedChunkIds.add(row.entity_id);
    }
    const kept = [];
    for (const hit of results) {
        const isBlocked = Boolean(hit.chunk_id) && blockedChunkIds.has(hit.chunk_id);
        if (!isBlocked) {
            kept.push(hit);
        }
    }
    return kept;
}
|
|
376
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
377
|
+
// V7 INTELLIGENCE OPTIMIZATION - COMPACT MODE & PROVENANCE SUMMARY
|
|
378
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
379
|
+
/**
 * Project a full search result onto the compact response shape.
 *
 * Only document_id, chunk_id, original_text, source_file_name, page_number,
 * result_type and a single `score` are kept. The score is read from the field
 * matching the search mode (`keyword` -> bm25_score, `hybrid` -> rrf_score,
 * anything else -> similarity_score); when that field is null/undefined the
 * first available of similarity_score, bm25_score, rrf_score is used.
 *
 * @param {object} r - Full search result.
 * @param {string} mode - Search mode ('keyword', 'hybrid', or other).
 * @returns {object} Compact result object.
 */
function compactResult(r, mode) {
    const preferredField = mode === 'keyword'
        ? 'bm25_score'
        : mode === 'hybrid'
            ? 'rrf_score'
            : 'similarity_score';
    const score = r[preferredField] ?? r.similarity_score ?? r.bm25_score ?? r.rrf_score;
    const { document_id, chunk_id, original_text, source_file_name, page_number, result_type } = r;
    return {
        document_id,
        chunk_id,
        original_text,
        source_file_name,
        page_number,
        score,
        result_type,
    };
}
|
|
406
|
+
/**
 * Build a one-line provenance summary string from the provenance chain.
 *
 * Links are rendered in chain order and joined with " → ", for example:
 *   "FILE → OCR (marker, quality 4.5/5.0) → Chunk 3 → Embedding"
 *
 * Rendering rules per link type:
 * - DOCUMENT: upper-cased `source_type`, or "DOCUMENT" when absent.
 * - OCR_RESULT: "OCR (<processor|unknown>[, quality X.X/5.0])"; the quality
 *   part appears only when `processing_quality_score` is a number.
 * - CHUNK: "Chunk <n>" with the 1-based chunk number when
 *   `location.chunk_index` is a number, otherwise just "Chunk".
 * - EMBEDDING -> "Embedding", VLM_DESCRIPTION -> "VLM".
 * - Any other type is emitted verbatim.
 *
 * @param db - Database service exposing `getProvenanceChain()`.
 * @param provenanceId - Provenance ID of the result; falsy yields undefined.
 * @returns {string|undefined} The summary, or undefined when there is no ID,
 *   the chain is empty, or building the summary failed (failure is logged).
 */
function buildProvenanceSummary(db, provenanceId) {
    if (!provenanceId) {
        return undefined;
    }
    // Renders a single chain link as its summary fragment.
    const describeLink = (link) => {
        switch (link.type) {
            case 'DOCUMENT':
                return link.source_type?.toUpperCase() ?? 'DOCUMENT';
            case 'OCR_RESULT': {
                const qualityScore = link.processing_quality_score;
                // Only numbers have toFixed; anything else is simply omitted
                // instead of aborting the whole summary.
                const qualityStr = typeof qualityScore === 'number'
                    ? `, quality ${qualityScore.toFixed(1)}/5.0`
                    : '';
                return `OCR (${link.processor ?? 'unknown'}${qualityStr})`;
            }
            case 'CHUNK': {
                const chunkIndex = link.location?.chunk_index;
                // A null index must not be coerced to 0 (which would print "Chunk 1").
                return typeof chunkIndex === 'number' ? `Chunk ${chunkIndex + 1}` : 'Chunk';
            }
            case 'EMBEDDING':
                return 'Embedding';
            case 'VLM_DESCRIPTION':
                return 'VLM';
            default:
                return link.type;
        }
    };
    try {
        const chain = db.getProvenanceChain(provenanceId);
        if (!chain || chain.length === 0) {
            return undefined;
        }
        return chain.map(describeLink).join(' \u2192 ');
    }
    catch (err) {
        console.error(`[search] Failed to build provenance summary for ${provenanceId}: ${err instanceof Error ? err.message : String(err)}`);
        return undefined;
    }
}
|
|
457
|
+
/**
 * Apply the V7 response options to `responseData.results`.
 *
 * Two independent steps, in this order:
 * 1. `input.include_provenance_summary`: each result gets a
 *    `provenance_summary` from buildProvenanceSummary. This has to run first
 *    because the compact shape no longer carries `provenance_id`.
 * 2. `input.compact`: the results array is replaced by compactResult
 *    projections (keeping a truthy `provenance_summary`) and
 *    `responseData.compact` is set to true.
 *
 * Intended to be called before results are grouped.
 *
 * @param {object} responseData - Response object with a `results` array; mutated.
 * @param {object} input - Tool input carrying the two flags.
 * @param db - Database service handed to buildProvenanceSummary.
 * @param {string} mode - Search mode handed to compactResult.
 * @returns {void}
 */
function applyV7Transforms(responseData, input, db, mode) {
    if (input.include_provenance_summary) {
        responseData.results.forEach((hit) => {
            hit.provenance_summary = buildProvenanceSummary(db, hit.provenance_id);
        });
    }
    if (!input.compact) {
        return;
    }
    const slimmed = [];
    for (const hit of responseData.results) {
        const slim = compactResult(hit, mode);
        if (hit.provenance_summary) {
            slim.provenance_summary = hit.provenance_summary;
        }
        slimmed.push(slim);
    }
    responseData.results = slimmed;
    responseData.compact = true;
}
|
|
480
|
+
/**
 * Attach cluster membership to every result that has a `document_id`
 * (mutates `results`).
 *
 * Cluster summaries are fetched once per distinct document through
 * getClusterSummariesForDocument and reduced to
 * `{cluster_id, cluster_label, run_id}`. If the lookup for a document throws,
 * the error is logged and that document gets an empty list. All results of
 * the same document receive the same `cluster_context` array instance.
 *
 * @param conn - Database connection handed to getClusterSummariesForDocument.
 * @param {Array<object>} results - Search results to enrich in place.
 * @returns {void}
 */
function attachClusterContext(conn, results) {
    const uniqueDocIds = Array.from(new Set(results.map((hit) => hit.document_id).filter(Boolean)));
    if (uniqueDocIds.length === 0) {
        return;
    }
    const contextByDoc = new Map();
    uniqueDocIds.forEach((docId) => {
        let memberships = [];
        try {
            memberships = getClusterSummariesForDocument(conn, docId).map(({ id, label, run_id }) => ({
                cluster_id: id,
                cluster_label: label,
                run_id,
            }));
        }
        catch (error) {
            console.error(`[Search] Failed to get cluster summaries for document ${docId}: ${String(error)}`);
            memberships = [];
        }
        contextByDoc.set(docId, memberships);
    });
    for (const hit of results) {
        if (!hit.document_id) {
            continue;
        }
        hit.cluster_context = contextByDoc.get(hit.document_id) ?? [];
    }
}
|
|
511
|
+
/**
 * Attach cross-document context to the FIRST result of each document
 * (mutates `results`).
 *
 * For every distinct document (id taken from `document_id`, falling back to
 * `source_document_id`) two lookups are made:
 * - up to 3 cluster memberships (id, label, classification_tag,
 *   similarity_to_centroid);
 * - up to 3 comparisons involving the document, highest similarity_ratio
 *   first, each reporting the other document as `related_doc_id`.
 * The result is stored as `document_context = {clusters, related_documents}`,
 * where an empty lookup is represented by null. Only the first result per
 * document is annotated, to keep the response small. A failing lookup is
 * logged and leaves that document without context.
 *
 * @param conn - Database connection exposing `prepare(sql).all(...)`.
 * @param {Array<object>} results - Search results to enrich in place.
 * @returns {void}
 */
function attachCrossDocumentContext(conn, results) {
    const docIdOf = (hit) => hit.document_id ?? hit.source_document_id;
    const uniqueDocIds = Array.from(new Set(results.map(docIdOf).filter(Boolean)));
    if (uniqueDocIds.length === 0) {
        return;
    }
    const orNull = (rows) => (rows.length > 0 ? rows : null);
    const contextByDoc = new Map();
    uniqueDocIds.forEach((docId) => {
        try {
            // Cluster memberships of this document.
            const clusterRows = conn.prepare(`SELECT c.id, c.label, c.classification_tag, dc.similarity_to_centroid
           FROM document_clusters dc JOIN clusters c ON c.id = dc.cluster_id
           WHERE dc.document_id = ? LIMIT 3`).all(docId);
            // Documents that were already compared to this one.
            const comparisonRows = conn.prepare(`SELECT
             CASE WHEN document_id_1 = ? THEN document_id_2 ELSE document_id_1 END as related_doc_id,
             similarity_ratio, summary
           FROM comparisons
           WHERE document_id_1 = ? OR document_id_2 = ?
           ORDER BY similarity_ratio DESC LIMIT 3`).all(docId, docId, docId);
            contextByDoc.set(docId, {
                clusters: orNull(clusterRows),
                related_documents: orNull(comparisonRows),
            });
        }
        catch (error) {
            console.error(`[Search] Failed to get cross-document context for ${docId}: ${String(error)}`);
        }
    });
    // Annotate only the first hit of every document.
    const annotatedDocs = new Set();
    for (const hit of results) {
        const docId = docIdOf(hit);
        if (!docId || annotatedDocs.has(docId)) {
            continue;
        }
        annotatedDocs.add(docId);
        const context = contextByDoc.get(docId);
        if (context) {
            hit.document_context = context;
        }
    }
}
|
|
556
|
+
/**
 * Add image metadata to results that reference an image (mutates `results`).
 *
 * For each result with a truthy `image_id` the image record is loaded through
 * getImage; when found, the result receives image_extracted_path,
 * image_page_number, image_dimensions ({width, height}), image_block_type and
 * image_format. Results without an image_id, or whose image is not found, are
 * left as they are.
 *
 * @param conn - Database connection handed to getImage.
 * @param {Array<object>} results - Search results to enrich in place.
 * @returns {void}
 */
function enrichVLMResultsWithImageMetadata(conn, results) {
    results.forEach((hit) => {
        if (!hit.image_id) {
            return;
        }
        const record = getImage(conn, hit.image_id);
        if (!record) {
            return;
        }
        hit.image_extracted_path = record.extracted_path;
        hit.image_page_number = record.page_number;
        const { width, height } = record.dimensions;
        hit.image_dimensions = { width, height };
        hit.image_block_type = record.block_type;
        hit.image_format = record.format;
    });
}
|
|
575
|
+
/**
 * Re-weight retrieved results using chunk metadata (mutates score fields).
 *
 * For each result a multiplier is accumulated from, in this order:
 * - Heading level (Task 2.1, unless options.headingBoost === false):
 *   level 1 -> 1.3x, 2 -> 1.2x, 3 -> 1.1x, level 4+ or missing -> 1.0x.
 * - Atomic chunk (Task 2.2, unless options.atomicBoost === false): 1.1x.
 * - Content-type preference (Task 2.3, when options.contentTypeQuery is set):
 *   table-like query words on a "table" chunk 1.2x, code-like words on a
 *   "code" chunk 1.2x, list-like words on a "list" chunk 1.15x.
 * - Block confidence (Task 4.3): 0.8 + 0.4 * computeBlockConfidence(types)
 *   when content_types parses to a non-empty array; parse failures are logged.
 * - Header/footer penalty (Task 7.1): 0.5x when the text matches one of
 *   options.repeatedHeaderFooterTexts, and another 0.5x when a short
 *   (< 80 chars) chunk looks like boilerplate (page numbers, "confidential",
 *   "all rights reserved", copyright lines).
 * The accumulated multiplier is clamped to [0.5, 2.0] and applied to every
 * score field that is present (bm25_score, similarity_score, rrf_score).
 *
 * @param {Array<object>} results - Search results; scores are updated in place.
 * @param {object} options - headingBoost, atomicBoost, contentTypeQuery,
 *   repeatedHeaderFooterTexts.
 * @returns {void}
 */
function applyMetadataBoosts(results, options) {
    // [query pattern, content_types marker, multiplier], applied in this order.
    const contentTypeRules = [
        [/\b(table|data|statistic|row|column|figure|chart)\b/, '"table"', 1.2],
        [/\b(code|function|class|method|import|variable|api)\b/, '"code"', 1.2],
        [/\b(list|items|steps|requirements|criteria)\b/, '"list"', 1.15],
    ];
    const boilerplatePatterns = [
        /^page\s+\d+(\s+of\s+\d+)?$/i,
        /^\d+$/,
        /^-\s*\d+\s*-$/,
        /^copyright\s/i,
        /^\u00a9\s/,
    ];
    const boilerplatePhrases = ['confidential', 'all rights reserved'];
    const scoreFields = ['bm25_score', 'similarity_score', 'rrf_score'];
    for (const hit of results) {
        let multiplier = 1.0;
        // Task 2.1: headings rank above body text.
        if (options.headingBoost !== false) {
            const headingLevel = hit.heading_level ?? 5;
            const boundedLevel = Math.min(Math.max(headingLevel, 1), 4);
            multiplier *= 1 + (0.1 * (4 - boundedLevel));
        }
        // Task 2.2: complete semantic units.
        if (options.atomicBoost !== false && hit.is_atomic) {
            multiplier *= 1.1;
        }
        // Task 2.3: query wording hints at the wanted content type.
        if (options.contentTypeQuery) {
            const loweredQuery = options.contentTypeQuery.toLowerCase();
            const declaredTypes = hit.content_types;
            if (declaredTypes) {
                for (const [pattern, marker, factor] of contentTypeRules) {
                    if (pattern.test(loweredQuery) && declaredTypes.includes(marker)) {
                        multiplier *= factor;
                    }
                }
            }
        }
        // Task 4.3: block confidence derived from the content types.
        try {
            const rawTypes = hit.content_types;
            if (rawTypes) {
                const typeList = JSON.parse(rawTypes);
                if (Array.isArray(typeList) && typeList.length > 0) {
                    multiplier *= 0.8 + (0.4 * computeBlockConfidence(typeList));
                }
            }
        }
        catch (error) {
            console.error(`[search] Failed to parse content_types for chunk ${hit.chunk_id ?? 'unknown'} during quality boost: ${error instanceof Error ? error.message : String(error)}`);
        }
        // Task 7.1, tier 1: texts the caller already identified as repeated.
        const chunkText = hit.original_text ?? '';
        const knownRepeats = options.repeatedHeaderFooterTexts;
        if (knownRepeats && knownRepeats.length > 0) {
            if (chunkText.length > 0 && isRepeatedHeaderFooter(chunkText, knownRepeats)) {
                multiplier *= 0.5;
            }
        }
        // Task 7.1, tier 2: heuristic for short, boilerplate-looking chunks.
        const stripped = chunkText.trim();
        if (stripped.length > 0 && stripped.length < 80) {
            const lowered = stripped.toLowerCase();
            const looksLikeBoilerplate = boilerplatePatterns.some((pattern) => pattern.test(stripped)) ||
                boilerplatePhrases.some((phrase) => lowered.includes(phrase));
            if (looksLikeBoilerplate) {
                multiplier *= 0.5;
            }
        }
        // Clamp so stacked penalties/boosts cannot swamp the relevance score.
        const boundedMultiplier = Math.max(0.5, Math.min(2.0, multiplier));
        for (const field of scoreFields) {
            if (hit[field] != null) {
                hit[field] = hit[field] * boundedMultiplier;
            }
        }
    }
}
|
|
666
|
+
/**
 * Gently down-weight results that come from very long documents
 * (mutates score fields).
 *
 * The chunk count of every document in the result set is fetched, the median
 * count is taken (upper middle element for an even number of documents), and
 * each result is scaled by sqrt(median / docChunks) clamped to [0.7, 1.0].
 * Documents at or below the median are therefore unaffected. Nothing happens
 * when the results reference at most one distinct document.
 *
 * @param {Array<object>} results - Search results; bm25_score,
 *   similarity_score and rrf_score are scaled in place when present.
 * @param db - Database service exposing `getConnection()`.
 * @returns {void}
 */
function applyLengthNormalization(results, db) {
    const uniqueDocIds = Array.from(new Set(results.map((hit) => hit.document_id).filter(Boolean)));
    if (uniqueDocIds.length <= 1) {
        // A single document leaves nothing to normalize against.
        return;
    }
    const marks = uniqueDocIds.map(() => '?').join(',');
    const countRows = db.getConnection()
        .prepare(`SELECT document_id, COUNT(*) as chunk_count FROM chunks WHERE document_id IN (${marks}) GROUP BY document_id`)
        .all(...uniqueDocIds);
    const chunkCountByDoc = new Map();
    for (const row of countRows) {
        chunkCountByDoc.set(row.document_id, row.chunk_count);
    }
    const ascendingCounts = Array.from(chunkCountByDoc.values()).sort((x, y) => x - y);
    const median = ascendingCounts[Math.floor(ascendingCounts.length / 2)] || 1;
    const scoreFields = ['bm25_score', 'similarity_score', 'rrf_score'];
    for (const hit of results) {
        // Documents without a count are treated as median-sized.
        const docChunks = chunkCountByDoc.get(hit.document_id) ?? median;
        const rawFactor = Math.sqrt(median / Math.max(docChunks, 1));
        const boundedFactor = Math.max(0.7, Math.min(1.0, rawFactor));
        for (const field of scoreFields) {
            if (hit[field] != null) {
                hit[field] = hit[field] * boundedFactor;
            }
        }
    }
}
|
|
697
|
+
/**
 * Drop repeated chunks from a search result list, keyed on content_hash (Task 7.3).
 * The first result carrying a given hash wins; later results with the same hash are
 * discarded. Results with no (or an empty) hash are never discarded.
 * The input array is left untouched; a fresh array is returned.
 */
function deduplicateByContentHash(results) {
    const seenHashes = new Set();
    return results.reduce((kept, item) => {
        const hash = item.content_hash ?? null;
        if (hash) {
            if (seenHashes.has(hash)) {
                return kept;
            }
            seenHashes.add(hash);
        }
        kept.push(item);
        return kept;
    }, []);
}
|
|
714
|
+
/**
 * Optionally add a formatted provenance chain to a search result object.
 * Used by the BM25, semantic, and hybrid handlers on both reranked and non-reranked paths.
 * When includeProvenance is falsy the result is left untouched.
 *
 * @param result - Result object to mutate
 * @param db - Database handle forwarded to formatProvenanceChain
 * @param provenanceId - Provenance record id forwarded to formatProvenanceChain
 * @param includeProvenance - Whether to attach the chain at all
 * @param provenanceKey - Response field name for provenance chain ('provenance' or 'provenance_chain')
 */
function attachProvenance(result, db, provenanceId, includeProvenance, provenanceKey = 'provenance') {
    if (!includeProvenance) {
        return;
    }
    const chain = formatProvenanceChain(db, provenanceId);
    result[provenanceKey] = chain;
}
|
|
725
|
+
/**
 * Reward clusters of neighbouring chunks in hybrid search results.
 * For every result that has a document_id and a chunk_index, count the other results
 * from the same document whose chunk_index differs from it by 1 or 2. When that count
 * is positive and the result has a numeric rrf_score, the score is multiplied by
 * (1 + 0.1 * nearbyCount).
 *
 * Mutates rrf_score in place.
 *
 * @returns { boosted_results: number } when at least one result was boosted, otherwise undefined
 */
function applyChunkProximityBoost(results) {
    // Group result positions by document, keeping only results that can be located in a document.
    const groups = new Map();
    for (const [position, hit] of results.entries()) {
        const docId = hit.document_id;
        const chunkIndex = hit.chunk_index;
        if (!docId || chunkIndex == null) {
            continue;
        }
        let bucket = groups.get(docId);
        if (bucket === undefined) {
            bucket = [];
            groups.set(docId, bucket);
        }
        bucket.push({ idx: position, chunkIndex });
    }
    let boosted = 0;
    for (const bucket of groups.values()) {
        if (bucket.length < 2) {
            continue;
        }
        for (const current of bucket) {
            // Neighbours are within 2 chunk positions but not at the same position.
            let neighbours = 0;
            for (const other of bucket) {
                const gap = Math.abs(other.chunkIndex - current.chunkIndex);
                if (gap <= 2 && other.chunkIndex !== current.chunkIndex) {
                    neighbours += 1;
                }
            }
            if (neighbours === 0) {
                continue;
            }
            const target = results[current.idx];
            const score = target.rrf_score;
            if (typeof score !== 'number') {
                continue;
            }
            target.rrf_score = score * (1 + 0.1 * neighbours);
            boosted += 1;
        }
    }
    return boosted > 0 ? { boosted_results: boosted } : undefined;
}
|
|
759
|
+
/**
 * Map BM25 results onto the ranked record shape consumed by RRF fusion.
 * `rank` is copied from the input and `score` is taken from bm25_score.
 * Missing optional metadata is filled with the defaults shown below.
 */
function toBm25Ranked(results) {
    const orNull = (value) => value ?? null;
    const orZero = (value) => value ?? 0;
    const convert = (hit) => {
        const identity = {
            chunk_id: hit.chunk_id,
            image_id: hit.image_id,
            extraction_id: hit.extraction_id,
            embedding_id: hit.embedding_id ?? '',
            document_id: hit.document_id,
            original_text: hit.original_text,
            result_type: hit.result_type,
        };
        const location = {
            source_file_path: hit.source_file_path,
            source_file_name: hit.source_file_name,
            source_file_hash: hit.source_file_hash,
            page_number: hit.page_number,
            character_start: hit.character_start,
            character_end: hit.character_end,
            chunk_index: hit.chunk_index,
            provenance_id: hit.provenance_id,
            content_hash: hit.content_hash,
        };
        const ranking = {
            rank: hit.rank,
            score: hit.bm25_score,
        };
        const metadata = {
            heading_context: orNull(hit.heading_context),
            section_path: orNull(hit.section_path),
            content_types: orNull(hit.content_types),
            is_atomic: hit.is_atomic ?? false,
            page_range: orNull(hit.page_range),
            heading_level: orNull(hit.heading_level),
            ocr_quality_score: orNull(hit.ocr_quality_score),
            doc_title: orNull(hit.doc_title),
            doc_author: orNull(hit.doc_author),
            doc_subject: orNull(hit.doc_subject),
            overlap_previous: orZero(hit.overlap_previous),
            overlap_next: orZero(hit.overlap_next),
            chunking_strategy: orNull(hit.chunking_strategy),
            embedding_status: hit.embedding_status ?? 'pending',
            doc_page_count: orNull(hit.doc_page_count),
            datalab_mode: orNull(hit.datalab_mode),
            total_chunks: orZero(hit.total_chunks),
        };
        // Spread order fixes the key order of the emitted record.
        return { ...identity, ...location, ...ranking, ...metadata };
    };
    return results.map((hit) => convert(hit));
}
|
|
801
|
+
/**
 * Map semantic search results onto the ranked record shape consumed by RRF fusion.
 * `rank` is the 1-based position in the input array and `score` is taken from
 * similarity_score. `page_range` is read from the input's chunk_page_range field.
 * Missing optional metadata is filled with the defaults shown below.
 */
function toSemanticRanked(results) {
    const orNull = (value) => value ?? null;
    const orZero = (value) => value ?? 0;
    const convert = (hit, position) => {
        const identity = {
            chunk_id: hit.chunk_id,
            image_id: hit.image_id,
            extraction_id: hit.extraction_id,
            embedding_id: hit.embedding_id,
            document_id: hit.document_id,
            original_text: hit.original_text,
            result_type: hit.result_type,
        };
        const location = {
            source_file_path: hit.source_file_path,
            source_file_name: hit.source_file_name,
            source_file_hash: hit.source_file_hash,
            page_number: hit.page_number,
            character_start: hit.character_start,
            character_end: hit.character_end,
            chunk_index: hit.chunk_index,
            total_chunks: orZero(hit.total_chunks),
            provenance_id: hit.provenance_id,
            content_hash: hit.content_hash,
        };
        const ranking = {
            rank: position + 1,
            score: hit.similarity_score,
        };
        const metadata = {
            heading_context: orNull(hit.heading_context),
            section_path: orNull(hit.section_path),
            content_types: orNull(hit.content_types),
            is_atomic: hit.is_atomic ?? false,
            page_range: orNull(hit.chunk_page_range),
            heading_level: orNull(hit.heading_level),
            ocr_quality_score: orNull(hit.ocr_quality_score),
            doc_title: orNull(hit.doc_title),
            doc_author: orNull(hit.doc_author),
            doc_subject: orNull(hit.doc_subject),
            overlap_previous: orZero(hit.overlap_previous),
            overlap_next: orZero(hit.overlap_next),
            chunking_strategy: orNull(hit.chunking_strategy),
            embedding_status: hit.embedding_status ?? 'pending',
            doc_page_count: orNull(hit.doc_page_count),
            datalab_mode: orNull(hit.datalab_mode),
        };
        // Spread order fixes the key order of the emitted record.
        return { ...identity, ...location, ...ranking, ...metadata };
    };
    return results.map((hit, position) => convert(hit, position));
}
|
|
843
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
844
|
+
// SEARCH TOOL HANDLERS
|
|
845
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
846
|
+
/**
 * Internal: Semantic vector search logic (called by unified handler)
 *
 * Pipeline, in order:
 *   1. Resolve document-level filters (metadata -> quality -> cluster) and chunk-level filters.
 *   2. Embed the original query (prefixed with the section path filter when one is given).
 *   3. Run the vector search. When the caller did not pass similarity_threshold, over-fetch
 *      with a 0.1 floor and derive the threshold from the score distribution
 *      (mean - stddev, clamped to [0.15, 0.5]).
 *   4. Optionally rerank the threshold-filtered candidates.
 *   5. Apply metadata boosts and length normalization, then enrich / deduplicate / filter
 *      and attach the optional context blocks.
 *   6. Build the response, apply V7 transforms, and optionally group by document.
 *
 * Ordering: when reranking ran, the reranker's order is preserved. Only the non-reranked
 * path is re-sorted by similarity_score after boosts; re-sorting reranked results by
 * similarity_score would discard the ordering the reranker produced.
 *
 * @param params - Search parameters, already validated and enriched by handleSearchUnified
 * @returns The formatted tool response; any thrown error is converted via handleError
 */
async function handleSearchSemanticInternal(params) {
    try {
        return await withDatabaseOperation(async ({ db, vector }) => {
            // Params already validated and enriched by handleSearchUnified
            const input = params;
            const conn = db.getConnection();
            // Semantic mode: skip query expansion entirely.
            // expand_query produces FTS5 OR-joined terms which have zero effect on vector search.
            // The embedding is always generated from the original query.
            // Resolve metadata filter to document IDs, then chain through quality + cluster filters
            const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
            // Resolve chunk-level filters
            const chunkFilter = resolveChunkFilter({
                content_type_filter: input.content_type_filter,
                section_path_filter: input.section_path_filter,
                heading_filter: input.heading_filter,
                page_range_filter: input.page_range_filter,
                is_atomic_filter: input.is_atomic_filter,
                heading_level_filter: input.heading_level_filter,
                min_page_count: input.min_page_count,
                max_page_count: input.max_page_count,
                table_columns_contain: input.table_columns_contain,
            });
            // Generate query embedding from original query
            const embedder = getEmbeddingService();
            let embeddingQuery = input.query;
            if (input.section_path_filter) {
                embeddingQuery = `[Section: ${input.section_path_filter}] ${embeddingQuery}`;
            }
            const queryVector = await embedder.embedSearchQuery(embeddingQuery);
            const limit = input.limit ?? 10;
            // Reranking needs a wider candidate pool than the final limit.
            const searchLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
            const requestedThreshold = input.similarity_threshold ?? 0.7;
            // Task 3.5: Adaptive similarity threshold
            // When user does NOT explicitly provide a threshold, use adaptive mode:
            // fetch extra candidates with low floor, then compute threshold from distribution
            const userExplicitlySetThreshold = params.similarity_threshold !== undefined;
            const useAdaptiveThreshold = !userExplicitlySetThreshold;
            const searchThreshold = useAdaptiveThreshold ? 0.1 : requestedThreshold;
            const adaptiveFetchLimit = useAdaptiveThreshold ? Math.max(searchLimit * 3, 30) : searchLimit;
            // Search for similar vectors
            const results = vector.searchSimilar(queryVector, {
                limit: adaptiveFetchLimit,
                threshold: searchThreshold,
                documentFilter,
                chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
                pageRangeFilter: input.page_range_filter,
            });
            // Task 3.5: Compute adaptive threshold from result distribution
            let effectiveThreshold = requestedThreshold;
            let thresholdInfo;
            if (useAdaptiveThreshold && results.length > 1) {
                const scores = results.map(r => r.similarity_score);
                const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
                const variance = scores.reduce((a, b) => a + (b - mean) ** 2, 0) / scores.length;
                const stddev = Math.sqrt(variance);
                const adaptiveRaw = mean - stddev;
                effectiveThreshold = Math.max(0.15, Math.min(0.5, adaptiveRaw));
                thresholdInfo = {
                    mode: 'adaptive',
                    requested: requestedThreshold,
                    effective: Math.round(effectiveThreshold * 1000) / 1000,
                    adaptive_raw: Math.round(adaptiveRaw * 1000) / 1000,
                    distribution: {
                        mean: Math.round(mean * 1000) / 1000,
                        stddev: Math.round(stddev * 1000) / 1000,
                        candidates_evaluated: results.length,
                    },
                };
            }
            else if (useAdaptiveThreshold) {
                // Too few results for stats, fall back to default
                effectiveThreshold = requestedThreshold;
                thresholdInfo = {
                    mode: 'adaptive_fallback',
                    requested: requestedThreshold,
                    effective: requestedThreshold,
                    reason: 'too_few_results_for_adaptive',
                };
            }
            else {
                thresholdInfo = {
                    mode: 'explicit',
                    requested: requestedThreshold,
                    effective: requestedThreshold,
                };
            }
            // Filter results by effective threshold and apply final limit
            const thresholdFiltered = results
                .filter(r => r.similarity_score >= effectiveThreshold)
                .slice(0, searchLimit);
            // Shared shape of a response result; both the reranked and non-reranked paths
            // emit exactly these fields in exactly this order.
            const buildResult = (r) => ({
                embedding_id: r.embedding_id,
                chunk_id: r.chunk_id,
                image_id: r.image_id,
                extraction_id: r.extraction_id ?? null,
                document_id: r.document_id,
                result_type: r.result_type,
                similarity_score: r.similarity_score,
                original_text: r.original_text,
                source_file_path: r.source_file_path,
                source_file_name: r.source_file_name,
                source_file_hash: r.source_file_hash,
                page_number: r.page_number,
                character_start: r.character_start,
                character_end: r.character_end,
                chunk_index: r.chunk_index,
                total_chunks: r.total_chunks,
                content_hash: r.content_hash,
                provenance_id: r.provenance_id,
                heading_context: r.heading_context ?? null,
                section_path: r.section_path ?? null,
                content_types: r.content_types ?? null,
                is_atomic: r.is_atomic ?? false,
                chunk_page_range: r.chunk_page_range ?? null,
                heading_level: r.heading_level ?? null,
                ocr_quality_score: r.ocr_quality_score ?? null,
                doc_title: r.doc_title ?? null,
                doc_author: r.doc_author ?? null,
                doc_subject: r.doc_subject ?? null,
                overlap_previous: r.overlap_previous ?? 0,
                overlap_next: r.overlap_next ?? 0,
                chunking_strategy: r.chunking_strategy ?? null,
                embedding_status: r.embedding_status ?? 'pending',
                doc_page_count: r.doc_page_count ?? null,
                datalab_mode: r.datalab_mode ?? null,
            });
            let finalResults;
            let rerankInfo;
            if (input.rerank && thresholdFiltered.length > 0) {
                const rerankInput = thresholdFiltered.map((r) => ({
                    chunk_id: r.chunk_id,
                    image_id: r.image_id,
                    extraction_id: r.extraction_id,
                    embedding_id: r.embedding_id,
                    document_id: r.document_id,
                    original_text: r.original_text,
                    result_type: r.result_type,
                    source_file_path: r.source_file_path,
                    source_file_name: r.source_file_name,
                    source_file_hash: r.source_file_hash,
                    page_number: r.page_number,
                    character_start: r.character_start,
                    character_end: r.character_end,
                    chunk_index: r.chunk_index,
                    provenance_id: r.provenance_id,
                    content_hash: r.content_hash,
                    rank: 0,
                    score: r.similarity_score,
                }));
                const reranked = await rerankResults(input.query, rerankInput, limit);
                finalResults = reranked.map((r) => {
                    // original_index points back into the candidate list handed to the reranker.
                    const original = thresholdFiltered[r.original_index];
                    const result = {
                        ...buildResult(original),
                        rerank_score: r.relevance_score,
                        rerank_reasoning: r.reasoning,
                    };
                    attachProvenance(result, db, original.provenance_id, !!input.include_provenance);
                    return result;
                });
                rerankInfo = {
                    reranked: true,
                    candidates_evaluated: Math.min(thresholdFiltered.length, 20),
                    results_returned: finalResults.length,
                };
            }
            else {
                finalResults = thresholdFiltered.map((r) => {
                    const result = buildResult(r);
                    attachProvenance(result, db, r.provenance_id, !!input.include_provenance);
                    return result;
                });
            }
            // Apply metadata-based score boosts and length normalization
            applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
            applyLengthNormalization(finalResults, db);
            // Re-sort by similarity_score after boosts, but only when no reranking ran:
            // reranked results must keep the order the reranker returned.
            if (!rerankInfo) {
                finalResults.sort((a, b) => b.similarity_score - a.similarity_score);
            }
            // Enrich VLM results with image metadata
            enrichVLMResultsWithImageMetadata(conn, finalResults);
            // Task 7.3: Deduplicate by content_hash if requested
            if (input.exclude_duplicate_chunks) {
                finalResults = deduplicateByContentHash(finalResults);
            }
            // T2.8: Exclude system:repeated_header_footer tagged chunks by default
            if (!input.include_headers_footers) {
                finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
            }
            // Task 3.1: Cluster context included by default (unless explicitly false)
            const clusterContextIncluded = input.include_cluster_context && finalResults.length > 0;
            if (clusterContextIncluded) {
                attachClusterContext(conn, finalResults);
            }
            // Phase 4: Attach neighbor context chunks if requested
            const contextChunkCount = input.include_context_chunks ?? 0;
            if (contextChunkCount > 0) {
                attachContextChunks(conn, finalResults, contextChunkCount);
            }
            // Phase 5: Attach table metadata for atomic table chunks
            attachTableMetadata(conn, finalResults);
            // T2.12: Attach cross-document context if requested
            if (input.include_document_context) {
                attachCrossDocumentContext(conn, finalResults);
            }
            const responseData = {
                query: input.query,
                results: finalResults,
                total: finalResults.length,
                threshold: effectiveThreshold,
                threshold_info: thresholdInfo,
                metadata_boosts_applied: true,
                cluster_context_included: clusterContextIncluded,
                next_steps: finalResults.length === 0
                    ? [
                        { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
                        { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
                    ]
                    : finalResults.length === 1
                        ? [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_find_similar', description: 'Find related documents' },
                        ]
                        : [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
                        ],
            };
            // No query_expansion in semantic mode — expansion only applies to BM25/hybrid.
            if (rerankInfo) {
                responseData.rerank = rerankInfo;
            }
            // V7: Apply compact mode and provenance summaries before grouping
            applyV7Transforms(responseData, input, db, 'semantic');
            if (input.group_by_document) {
                const { grouped, total_documents } = groupResultsByDocument(responseData.results);
                const groupedResponse = {
                    ...responseData,
                    total_results: finalResults.length,
                    total_documents,
                    documents: grouped,
                };
                // The grouped shape replaces the flat list and its count.
                delete groupedResponse.results;
                delete groupedResponse.total;
                return formatResponse(successResult(groupedResponse));
            }
            return formatResponse(successResult(responseData));
        }); // end withDatabaseOperation
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1136
|
+
/**
 * Internal: BM25 full-text keyword search logic (called by unified handler)
 *
 * Pipeline, in order:
 *   1. Optionally expand the query (synonyms + corpus terms) and resolve document- and
 *      chunk-level filters.
 *   2. Run BM25 over three FTS sources (chunks, VLM descriptions, extractions), merge by
 *      bm25_score, truncate, and assign ranks from the merged order.
 *   3. Optionally rerank the merged candidates.
 *   4. Apply metadata boosts and length normalization, then enrich / deduplicate / filter
 *      and attach the optional context blocks.
 *   5. Run a separate document-metadata search on the original query, build the response,
 *      apply V7 transforms, and optionally group by document.
 *
 * Ordering: when reranking ran, the reranker's order is preserved. Only the non-reranked
 * path is re-sorted by bm25_score after boosts; re-sorting reranked results by bm25_score
 * would discard the ordering the reranker produced.
 *
 * @param params - Search parameters, already validated and enriched by handleSearchUnified
 * @returns The formatted tool response; any thrown error is converted via handleError
 */
async function handleSearchKeywordInternal(params) {
    try {
        return await withDatabaseOperation(async ({ db }) => {
            // Params already validated and enriched by handleSearchUnified
            const input = params;
            const conn = db.getConnection();
            // Expand query with domain-specific synonyms + corpus cluster terms if requested
            const tableQueryDetected = isTableQuery(input.query);
            let searchQuery = input.query;
            let queryExpansion;
            if (input.expand_query) {
                searchQuery = expandQuery(input.query, db, tableQueryDetected);
                queryExpansion = getExpandedTerms(input.query, db, tableQueryDetected);
            }
            // Resolve metadata filter to document IDs, then chain through quality + cluster filters
            const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
            // Resolve chunk-level filters
            const chunkFilter = resolveChunkFilter({
                content_type_filter: input.content_type_filter,
                section_path_filter: input.section_path_filter,
                heading_filter: input.heading_filter,
                page_range_filter: input.page_range_filter,
                is_atomic_filter: input.is_atomic_filter,
                heading_level_filter: input.heading_level_filter,
                min_page_count: input.min_page_count,
                max_page_count: input.max_page_count,
                table_columns_contain: input.table_columns_contain,
            });
            const bm25 = new BM25SearchService(conn);
            const limit = input.limit ?? 10;
            // Over-fetch from each source (limit * 2) since we merge and truncate
            const fetchLimit = input.rerank ? Math.max(limit * 2, 20) : limit * 2;
            // When expand_query produced an OR-joined FTS5 expression, pass preSanitized
            // to prevent sanitizeFTS5Query from inserting implicit AND (H-2 fix).
            const preSanitized = !!input.expand_query;
            // Options common to all three FTS searches; each call spreads a fresh copy.
            const sharedSearchOptions = {
                query: searchQuery,
                limit: fetchLimit,
                phraseSearch: input.phrase_search,
                documentFilter,
                includeHighlight: input.include_highlight,
            };
            // Search chunks FTS
            const chunkResults = bm25.search({
                ...sharedSearchOptions,
                chunkFilter: chunkFilter.conditions.length > 0 ? chunkFilter : undefined,
                preSanitized,
            });
            // Search VLM FTS
            const vlmResults = bm25.searchVLM({
                ...sharedSearchOptions,
                pageRangeFilter: input.page_range_filter,
                preSanitized,
            });
            // Search extractions FTS
            const extractionResults = bm25.searchExtractions({
                ...sharedSearchOptions,
                preSanitized,
            });
            // Merge by score (higher is better), apply combined limit
            const mergeLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
            const allResults = [...chunkResults, ...vlmResults, ...extractionResults]
                .sort((a, b) => b.bm25_score - a.bm25_score)
                .slice(0, mergeLimit);
            // Re-rank after merge
            const rankedResults = allResults.map((r, i) => ({ ...r, rank: i + 1 }));
            let finalResults;
            let rerankInfo;
            if (input.rerank && rankedResults.length > 0) {
                const rerankInput = rankedResults.map((r) => ({ ...r }));
                const reranked = await rerankResults(input.query, rerankInput, limit);
                finalResults = reranked.map((r) => {
                    // original_index points back into the candidate list handed to the reranker.
                    const original = rankedResults[r.original_index];
                    const base = {
                        ...original,
                        rerank_score: r.relevance_score,
                        rerank_reasoning: r.reasoning,
                    };
                    attachProvenance(base, db, original.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
                rerankInfo = {
                    reranked: true,
                    candidates_evaluated: Math.min(rankedResults.length, 20),
                    results_returned: finalResults.length,
                };
            }
            else {
                finalResults = rankedResults.map((r) => {
                    const base = { ...r };
                    attachProvenance(base, db, r.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
            }
            // Apply metadata-based score boosts and length normalization
            applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
            applyLengthNormalization(finalResults, db);
            // Re-sort by bm25_score after boosts, but only when no reranking ran:
            // reranked results must keep the order the reranker returned.
            if (!rerankInfo) {
                finalResults.sort((a, b) => b.bm25_score - a.bm25_score);
            }
            // Enrich VLM results with image metadata
            enrichVLMResultsWithImageMetadata(conn, finalResults);
            // Task 7.3: Deduplicate by content_hash if requested
            if (input.exclude_duplicate_chunks) {
                finalResults = deduplicateByContentHash(finalResults);
            }
            // T2.8: Exclude system:repeated_header_footer tagged chunks by default
            if (!input.include_headers_footers) {
                finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
            }
            // Compute source counts from final merged results (not pre-merge candidates)
            let finalChunkCount = 0;
            let finalVlmCount = 0;
            let finalExtractionCount = 0;
            for (const r of finalResults) {
                if (r.result_type === 'chunk')
                    finalChunkCount++;
                else if (r.result_type === 'vlm')
                    finalVlmCount++;
                else
                    finalExtractionCount++; // any other result_type is counted as an extraction
            }
            // Task 3.1: Cluster context included by default (unless explicitly false)
            const clusterContextIncluded = input.include_cluster_context && finalResults.length > 0;
            if (clusterContextIncluded) {
                attachClusterContext(conn, finalResults);
            }
            // Phase 4: Attach neighbor context chunks if requested
            const contextChunkCount = input.include_context_chunks ?? 0;
            if (contextChunkCount > 0) {
                attachContextChunks(conn, finalResults, contextChunkCount);
            }
            // Phase 5: Attach table metadata for atomic table chunks
            attachTableMetadata(conn, finalResults);
            // T2.12: Attach cross-document context if requested
            if (input.include_document_context) {
                attachCrossDocumentContext(conn, finalResults);
            }
            // Document metadata matches (v30 FTS5 on doc_title/author/subject)
            // Uses the original query, not the expanded one.
            let documentMetadataMatches;
            const metadataResults = bm25.searchDocumentMetadata({
                query: input.query,
                limit: 5,
                phraseSearch: input.phrase_search,
            });
            if (metadataResults.length > 0) {
                documentMetadataMatches = metadataResults;
            }
            const responseData = {
                query: input.query,
                search_type: 'bm25',
                results: finalResults,
                total: finalResults.length,
                sources: {
                    chunk_count: finalChunkCount,
                    vlm_count: finalVlmCount,
                    extraction_count: finalExtractionCount,
                },
                metadata_boosts_applied: true,
                cluster_context_included: clusterContextIncluded,
                next_steps: finalResults.length === 0
                    ? [
                        { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
                        { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
                    ]
                    : finalResults.length === 1
                        ? [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_find_similar', description: 'Find related documents' },
                        ]
                        : [
                            { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                            { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                            { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
                        ],
            };
            if (documentMetadataMatches) {
                responseData.document_metadata_matches = documentMetadataMatches;
            }
            // Task 3.2: Standardized query expansion details
            if (queryExpansion) {
                responseData.query_expansion = {
                    original_query: queryExpansion.original,
                    expanded_query: searchQuery,
                    synonyms_found: queryExpansion.synonyms_found,
                    terms_added: queryExpansion.expanded.length,
                    corpus_terms: queryExpansion.corpus_terms,
                };
            }
            if (rerankInfo) {
                responseData.rerank = rerankInfo;
            }
            // V7: Apply compact mode and provenance summaries before grouping
            applyV7Transforms(responseData, input, db, 'keyword');
            if (input.group_by_document) {
                const { grouped, total_documents } = groupResultsByDocument(responseData.results);
                const groupedResponse = {
                    ...responseData,
                    total_results: finalResults.length,
                    total_documents,
                    documents: grouped,
                };
                // The grouped shape replaces the flat list and its count.
                delete groupedResponse.results;
                delete groupedResponse.total;
                return formatResponse(successResult(groupedResponse));
            }
            return formatResponse(successResult(responseData));
        }); // end withDatabaseOperation
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1357
|
+
/**
 * Internal: Hybrid search using Reciprocal Rank Fusion (called by unified handler).
 *
 * Pipeline, in order:
 *   1. Optional auto-routing: classify the query and override the BM25/semantic weights.
 *   2. Optional query expansion for the BM25 side only.
 *   3. Resolve document-level and chunk-level filters.
 *   4. BM25 search over chunks, VLM descriptions and extractions, merged by bm25_score.
 *   5. Vector search using the ORIGINAL query text.
 *   6. RRF fusion, optional reranking, provenance attachment.
 *   7. Boosts, normalization, enrichment, de-duplication and header/footer filtering.
 *   8. Response assembly (optionally grouped by document).
 *
 * @param {object} params - Flat search params, already validated and enriched by
 *   handleSearchUnified. The object is not mutated; weight overrides are applied
 *   to a local shallow copy.
 * @returns {Promise<object>} The formatted success response, or the value
 *   produced by handleError when any step throws.
 */
async function handleSearchHybridInternal(params) {
    try {
        return await withDatabaseOperation(async ({ db, vector }) => {
            // Params already validated and enriched by handleSearchUnified.
            // Work on a shallow copy: auto-routing overwrites the fusion weights below,
            // and those writes must not leak into the caller's object.
            const input = { ...params };
            const limit = input.limit ?? 10;
            const conn = db.getConnection();
            // Auto-route: classify query and adjust weights
            let queryClassification;
            if (input.auto_route) {
                queryClassification = classifyQuery(input.query);
                if (queryClassification.query_type === 'exact') {
                    input.bm25_weight = 1.5;
                    input.semantic_weight = 0.5;
                }
                else if (queryClassification.query_type === 'semantic') {
                    input.bm25_weight = 0.5;
                    input.semantic_weight = 1.5;
                }
                // Any other classification keeps the weights the caller supplied.
            }
            // Expand query with domain-specific synonyms + corpus cluster terms if requested
            const tableQueryDetected = isTableQuery(input.query);
            let searchQuery = input.query;
            let queryExpansion;
            if (input.expand_query) {
                searchQuery = expandQuery(input.query, db, tableQueryDetected);
                queryExpansion = getExpandedTerms(input.query, db, tableQueryDetected);
            }
            // Resolve metadata filter to document IDs, then chain through quality + cluster filters
            const documentFilter = resolveClusterFilter(conn, input.cluster_id, resolveQualityFilter(db, input.min_quality_score, resolveMetadataFilter(db, input.metadata_filter, input.document_filter)));
            // Resolve chunk-level filters
            const chunkFilter = resolveChunkFilter({
                content_type_filter: input.content_type_filter,
                section_path_filter: input.section_path_filter,
                heading_filter: input.heading_filter,
                page_range_filter: input.page_range_filter,
                is_atomic_filter: input.is_atomic_filter,
                heading_level_filter: input.heading_level_filter,
                min_page_count: input.min_page_count,
                max_page_count: input.max_page_count,
                table_columns_contain: input.table_columns_contain,
            });
            // Only forward the chunk filter when it actually carries conditions.
            const activeChunkFilter = chunkFilter.conditions.length > 0 ? chunkFilter : undefined;
            // Get BM25 results (chunks + VLM + extractions)
            const bm25 = new BM25SearchService(db.getConnection());
            // When expand_query produced an OR-joined FTS5 expression, pass preSanitized
            // to prevent sanitizeFTS5Query from inserting implicit AND (H-2 fix).
            const preSanitized = !!input.expand_query;
            // includeHighlight: false -- hybrid discards BM25 highlights (RRF doesn't surface snippets)
            const bm25ChunkResults = bm25.search({
                query: searchQuery,
                limit: limit * 2,
                documentFilter,
                includeHighlight: false,
                chunkFilter: activeChunkFilter,
                preSanitized,
            });
            const bm25VlmResults = bm25.searchVLM({
                query: searchQuery,
                limit: limit * 2,
                documentFilter,
                includeHighlight: false,
                pageRangeFilter: input.page_range_filter,
                preSanitized,
            });
            const bm25ExtractionResults = bm25.searchExtractions({
                query: searchQuery,
                limit: limit * 2,
                documentFilter,
                includeHighlight: false,
                preSanitized,
            });
            // Merge BM25 results by score and assign 1-based ranks for fusion
            const allBm25 = [...bm25ChunkResults, ...bm25VlmResults, ...bm25ExtractionResults]
                .sort((a, b) => b.bm25_score - a.bm25_score)
                .slice(0, limit * 2)
                .map((r, i) => ({ ...r, rank: i + 1 }));
            // Get semantic results using ORIGINAL query (not FTS5-expanded)
            // The expanded query contains OR operators that contaminate embedding vectors
            const embedder = getEmbeddingService();
            let hybridEmbeddingQuery = input.query;
            if (input.section_path_filter) {
                hybridEmbeddingQuery = `[Section: ${input.section_path_filter}] ${hybridEmbeddingQuery}`;
            }
            const queryVector = await embedder.embedSearchQuery(hybridEmbeddingQuery);
            const semanticResults = vector.searchSimilar(queryVector, {
                limit: limit * 2,
                // Lower threshold than standalone (0.7) -- RRF de-ranks low-quality results
                threshold: 0.3,
                documentFilter,
                chunkFilter: activeChunkFilter,
                pageRangeFilter: input.page_range_filter,
            });
            // Convert to ranked format and fuse with RRF
            const bm25Ranked = toBm25Ranked(allBm25);
            const semanticRanked = toSemanticRanked(semanticResults);
            const fusion = new RRFFusion({
                k: input.rrf_k,
                bm25Weight: input.bm25_weight,
                semanticWeight: input.semantic_weight,
            });
            // Reranking needs a wider candidate pool than the final limit.
            const fusionLimit = input.rerank ? Math.max(limit * 2, 20) : limit;
            const rawResults = fusion.fuse(bm25Ranked, semanticRanked, fusionLimit);
            let finalResults;
            let rerankInfo;
            if (input.rerank && rawResults.length > 0) {
                // Rerank on copies so rawResults stays addressable via original_index.
                const rerankInput = rawResults.map((r) => ({ ...r }));
                const reranked = await rerankResults(input.query, rerankInput, limit);
                finalResults = reranked.map((r) => {
                    const original = rawResults[r.original_index];
                    const base = {
                        ...original,
                        rerank_score: r.relevance_score,
                        rerank_reasoning: r.reasoning,
                    };
                    attachProvenance(base, db, original.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
                rerankInfo = {
                    reranked: true,
                    candidates_evaluated: Math.min(rawResults.length, 20),
                    results_returned: finalResults.length,
                };
            }
            else {
                finalResults = rawResults.map((r) => {
                    const base = { ...r };
                    attachProvenance(base, db, r.provenance_id, !!input.include_provenance, 'provenance_chain');
                    return base;
                });
            }
            // Chunk proximity boost - reward clusters of nearby relevant chunks
            const chunkProximityInfo = finalResults.length > 0 ? applyChunkProximityBoost(finalResults) : undefined;
            // Apply metadata-based score boosts and length normalization
            applyMetadataBoosts(finalResults, { contentTypeQuery: input.query });
            applyLengthNormalization(finalResults, db);
            // Enrich VLM results with image metadata
            enrichVLMResultsWithImageMetadata(conn, finalResults);
            // Re-sort by rrf_score after proximity boost and metadata boosts may have changed scores
            // NOTE(review): with rerank enabled this orders by rrf_score, not rerank_score,
            // so the reranker's ordering is not what the caller receives -- confirm intended.
            finalResults.sort((a, b) => b.rrf_score - a.rrf_score);
            // Task 7.3: Deduplicate by content_hash if requested
            if (input.exclude_duplicate_chunks) {
                finalResults = deduplicateByContentHash(finalResults);
            }
            // T2.8: Exclude system:repeated_header_footer tagged chunks by default
            if (!input.include_headers_footers) {
                finalResults = excludeRepeatedHeaderFooterChunks(conn, finalResults);
            }
            // Task 3.1: Cluster context is attached only when requested and there is something to annotate
            const clusterContextIncluded = input.include_cluster_context && finalResults.length > 0;
            if (clusterContextIncluded) {
                attachClusterContext(conn, finalResults);
            }
            // Phase 4: Attach neighbor context chunks if requested
            const contextChunkCount = input.include_context_chunks ?? 0;
            if (contextChunkCount > 0) {
                attachContextChunks(conn, finalResults, contextChunkCount);
            }
            // Phase 5: Attach table metadata for atomic table chunks
            attachTableMetadata(db.getConnection(), finalResults);
            // T2.12: Attach cross-document context if requested
            if (input.include_document_context) {
                attachCrossDocumentContext(conn, finalResults);
            }
            // Follow-up tool suggestions depend on how many results survived filtering.
            let nextSteps;
            if (finalResults.length === 0) {
                nextSteps = [
                    { tool: 'ocr_search', description: 'Try different keywords, mode, or broader query' },
                    { tool: 'ocr_ingest_files', description: 'Add more documents to expand searchable content' },
                ];
            }
            else if (finalResults.length === 1) {
                nextSteps = [
                    { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                    { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                    { tool: 'ocr_document_find_similar', description: 'Find related documents' },
                ];
            }
            else {
                nextSteps = [
                    { tool: 'ocr_chunk_context', description: 'Expand a result with neighboring chunks for more context' },
                    { tool: 'ocr_document_get', description: 'Deep-dive into a specific source document' },
                    { tool: 'ocr_document_page', description: 'Read the full page a result came from' },
                ];
            }
            const responseData = {
                query: input.query,
                search_type: 'rrf_hybrid',
                config: {
                    bm25_weight: input.bm25_weight,
                    semantic_weight: input.semantic_weight,
                    rrf_k: input.rrf_k,
                },
                results: finalResults,
                total: finalResults.length,
                sources: {
                    bm25_chunk_count: bm25ChunkResults.length,
                    bm25_vlm_count: bm25VlmResults.length,
                    bm25_extraction_count: bm25ExtractionResults.length,
                    semantic_count: semanticResults.length,
                },
                metadata_boosts_applied: true,
                cluster_context_included: clusterContextIncluded,
                next_steps: nextSteps,
            };
            // Task 3.2: Standardized query expansion details
            if (queryExpansion) {
                responseData.query_expansion = {
                    original_query: queryExpansion.original,
                    expanded_query: searchQuery,
                    synonyms_found: queryExpansion.synonyms_found,
                    terms_added: queryExpansion.expanded.length,
                    corpus_terms: queryExpansion.corpus_terms,
                };
            }
            if (rerankInfo) {
                responseData.rerank = rerankInfo;
            }
            if (chunkProximityInfo) {
                responseData.chunk_proximity_boost = chunkProximityInfo;
            }
            if (queryClassification) {
                responseData.query_classification = queryClassification;
            }
            // V7: Apply compact mode and provenance summaries before grouping
            applyV7Transforms(responseData, input, db, 'hybrid');
            if (input.group_by_document) {
                const { grouped, total_documents } = groupResultsByDocument(responseData.results);
                const groupedResponse = {
                    ...responseData,
                    total_results: finalResults.length,
                    total_documents,
                    documents: grouped,
                };
                // The grouped shape replaces the flat list and its count.
                delete groupedResponse.results;
                delete groupedResponse.total;
                return formatResponse(successResult(groupedResponse));
            }
            return formatResponse(successResult(responseData));
        }); // end withDatabaseOperation
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1599
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1600
|
+
// UNIFIED SEARCH HANDLER
|
|
1601
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1602
|
+
/**
 * Handle ocr_search - single entry point for keyword (BM25), semantic (vector)
 * and hybrid (BM25 + semantic, RRF-fused) search.
 *
 * Validates the request, flattens the nested `filters` object into top-level
 * fields (the internal handlers expect flat params), pins a set of always-on
 * options, then dispatches on `mode`. Any mode other than 'keyword' or
 * 'semantic' is served by the hybrid handler.
 *
 * Options pinned here regardless of caller input:
 * - quality_boost: true
 * - expand_query: true
 * - exclude_duplicate_chunks: true
 * - include_headers_footers: false
 * - include_cluster_context: true
 * - include_document_context: true
 *
 * @param {object} params - Raw tool arguments; validated against SearchUnifiedInput.
 * @returns {Promise<object>} The internal handler's response, or the value
 *   produced by handleError on validation/dispatch failure.
 */
export async function handleSearchUnified(params) {
    try {
        const input = validateInput(SearchUnifiedInput, params);
        const nestedFilters = input.filters ?? {};
        // Every filter key is always emitted (possibly undefined), in this order.
        const filterKeys = [
            'document_filter',
            'metadata_filter',
            'min_quality_score',
            'cluster_id',
            'content_type_filter',
            'section_path_filter',
            'heading_filter',
            'page_range_filter',
            'is_atomic_filter',
            'heading_level_filter',
            'min_page_count',
            'max_page_count',
            'table_columns_contain',
        ];
        const flattenedFilters = Object.fromEntries(filterKeys.map((key) => [key, nestedFilters[key]]));
        // similarity_threshold is forwarded only when the caller supplied one; leaving
        // the key absent lets the semantic handler fall back to its adaptive threshold.
        const thresholdOverride = input.similarity_threshold !== undefined
            ? { similarity_threshold: input.similarity_threshold }
            : {};
        const alwaysOn = {
            quality_boost: true,
            expand_query: true,
            exclude_duplicate_chunks: true,
            include_headers_footers: false,
            include_cluster_context: true,
            include_document_context: true,
        };
        const enrichedParams = {
            // Validated top-level params
            query: input.query,
            mode: input.mode,
            limit: input.limit,
            include_provenance: input.include_provenance,
            rerank: input.rerank,
            include_context_chunks: input.include_context_chunks,
            group_by_document: input.group_by_document,
            phrase_search: input.phrase_search,
            include_highlight: input.include_highlight,
            ...thresholdOverride,
            bm25_weight: input.bm25_weight,
            semantic_weight: input.semantic_weight,
            rrf_k: input.rrf_k,
            auto_route: input.auto_route,
            ...flattenedFilters,
            ...alwaysOn,
            // V7 Intelligence Optimization params
            compact: input.compact,
            include_provenance_summary: input.include_provenance_summary,
        };
        // Route to internal handler based on mode
        if (input.mode === 'keyword') {
            return await handleSearchKeywordInternal(enrichedParams);
        }
        if (input.mode === 'semantic') {
            return await handleSearchSemanticInternal(enrichedParams);
        }
        // 'hybrid' and anything unrecognised
        return await handleSearchHybridInternal(enrichedParams);
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1678
|
+
/**
 * Handle ocr_fts_manage - Manage FTS5 indexes (rebuild or check status).
 * Covers both chunks FTS and VLM FTS indexes.
 *
 * With action 'rebuild' the BM25 service rebuilds its index and the rebuild
 * result is returned. For any other action the service status is returned,
 * extended with `chunks_without_embeddings` (chunks that have no embedding row)
 * when that count can be queried; a failure of that extra query is logged and
 * does not fail the request.
 *
 * @param {object} params - Raw tool arguments; validated against FTSManageInput.
 * @returns {Promise<object>} Formatted success response, or the value produced
 *   by handleError on failure.
 */
export async function handleFTSManage(params) {
    try {
        const input = validateInput(FTSManageInput, params);
        const { db } = requireDatabase();
        const bm25 = new BM25SearchService(db.getConnection());
        // Both branches advertise the same follow-up tools.
        const followUps = [
            { tool: 'ocr_search', description: 'Search using the rebuilt index' },
            { tool: 'ocr_db_stats', description: 'Check database statistics' },
        ];
        if (input.action === 'rebuild') {
            const rebuildOutcome = bm25.rebuildIndex();
            return formatResponse(successResult({ operation: 'fts_rebuild', ...rebuildOutcome, next_steps: followUps }));
        }
        const status = bm25.getStatus();
        // Detect chunks without embeddings (invisible to semantic search).
        // Best-effort: the status report is still returned if this query fails.
        try {
            const missingEmbeddings = db
                .getConnection()
                .prepare(`SELECT COUNT(*) as cnt FROM chunks c
           LEFT JOIN embeddings e ON e.chunk_id = c.id
           WHERE e.id IS NULL`)
                .get();
            status.chunks_without_embeddings = missingEmbeddings.cnt;
        }
        catch (error) {
            console.error(`[Search] Failed to query chunks without embeddings: ${String(error)}`);
        }
        status.next_steps = followUps;
        return formatResponse(successResult(status));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1712
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1713
|
+
// RAG CONTEXT ASSEMBLY HANDLER
|
|
1714
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1715
|
+
/**
 * Task 3.3: Drop chunks that largely repeat an already-kept chunk of the same document.
 *
 * Results are walked in the given order, so the input must already be sorted
 * best-first. A result is discarded when more than 50% of ITS OWN character
 * range is covered by a previously kept result with the same document_id.
 * Results without a character range (neither character_start/character_end
 * nor char_start/char_end) are always kept.
 *
 * @param {Array<object>} results - Score-sorted results.
 * @returns {Array<object>} The input array itself when it has at most one
 *   element, otherwise a new array of the surviving results in input order.
 */
function deduplicateOverlappingResults(results) {
    if (results.length <= 1) {
        return results;
    }
    // Both field spellings occur; the long form wins when present.
    const spanOf = (item) => ({
        start: item.character_start ?? item.char_start,
        end: item.character_end ?? item.char_end,
    });
    const kept = [];
    for (const candidate of results) {
        const { start, end } = spanOf(candidate);
        if (start == null || end == null) {
            kept.push(candidate);
            continue;
        }
        const candidateLength = end - start;
        const shadowed = kept.some((earlier) => {
            if (earlier.document_id !== candidate.document_id) {
                return false;
            }
            const other = spanOf(earlier);
            if (other.start == null || other.end == null) {
                return false;
            }
            const shared = Math.min(end, other.end) - Math.max(start, other.start);
            // Strictly more than half of the candidate must be covered.
            return shared > 0 && candidateLength > 0 && shared / candidateLength > 0.5;
        });
        if (!shadowed) {
            kept.push(candidate);
        }
    }
    return kept;
}
|
|
1757
|
+
/**
 * Task 3.4: Cap how many results any single document may contribute.
 *
 * Keeps results in their incoming order and lets through at most
 * `maxPerDocument` of them per document_id; later results from a document
 * that has reached the cap are dropped.
 *
 * @param {Array<object>} results - Results, each carrying a document_id.
 * @param {number} [maxPerDocument=3] - Per-document cap.
 * @returns {Array<object>} New array with the surviving results.
 */
function enforceSourceDiversity(results, maxPerDocument = 3) {
    const takenPerDocument = new Map();
    return results.filter((entry) => {
        const taken = takenPerDocument.get(entry.document_id) ?? 0;
        if (taken < maxPerDocument) {
            takenPerDocument.set(entry.document_id, taken + 1);
            return true;
        }
        return false;
    });
}
|
|
1775
|
+
/**
 * Input schema for the RAG context tool. Declared next to its handler rather
 * than in the shared validation module because no other tool uses this shape.
 *
 * Fields:
 * - question: required, 1-2000 characters.
 * - limit: integer 1-20, defaults to 5.
 * - document_filter: optional list of strings.
 * - max_context_length: integer 500-50000, defaults to 8000.
 * - max_results_per_document: integer 1-20, defaults to 3.
 */
const RagContextInput = z.object({
    question: z.string().min(1).max(2000).describe('The question to build context for'),
    limit: z.number().int().min(1).max(20).default(5).describe('Maximum search results to include in context'),
    document_filter: z.array(z.string()).optional().describe('Restrict to specific documents'),
    max_context_length: z.number().int().min(500).max(50000).default(8000).describe('Maximum total context length in characters'),
    max_results_per_document: z.number().int().min(1).max(20).default(3).describe('Maximum chunks per document for source diversity (default: 3)'),
});
|
|
1804
|
+
/**
 * Render one fused search result as markdown lines plus its source record.
 *
 * @param {object} r - Fused result (reads rrf_score, source_file_name,
 *   source_file_path, page_number, section_path, heading_context,
 *   image_extracted_path, image_block_type, image_page_number, original_text,
 *   document_id).
 * @param {number} position - 1-based position shown in the heading.
 * @returns {{ lines: string[], source: { file_name: string, page_number: *, document_id: * } }}
 */
function formatRagExcerpt(r, position) {
    const score = Math.round(r.rrf_score * 1000) / 1000;
    const fileName = r.source_file_name || path.basename(r.source_file_path || 'unknown');
    const pageInfo = r.page_number !== null && r.page_number !== undefined ? `, Page ${r.page_number}` : '';
    // A result without text must not abort the whole context build; quote it as empty.
    const quotedText = (r.original_text ?? '').replace(/\n/g, '\n> ');
    const lines = [
        `### Result ${position} (Score: ${score})`,
        `**Source:** ${fileName}${pageInfo}`,
    ];
    if (r.section_path) {
        lines.push(`**Section:** ${r.section_path}`);
    }
    if (r.heading_context) {
        lines.push(`**Heading:** ${r.heading_context}`);
    }
    // For VLM results with image metadata, include image context
    if (r.image_extracted_path) {
        const blockType = r.image_block_type || 'Image';
        const imgPage = r.image_page_number ?? r.page_number ?? 'unknown';
        lines.push(`> **[Image: ${blockType} on page ${imgPage}]**`);
        lines.push(`> File: ${r.image_extracted_path}`);
        lines.push(`> Description: ${quotedText}\n`);
    }
    else {
        lines.push(`> ${quotedText}\n`);
    }
    return {
        lines,
        source: {
            file_name: fileName,
            page_number: r.page_number,
            document_id: r.document_id,
        },
    };
}
/**
 * Handle ocr_rag_context - Assemble a RAG context block for LLM consumption.
 *
 * Pipeline:
 * 1. Hybrid search: BM25 over chunks, VLM descriptions and extractions, plus
 *    vector search, fused with RRF (k = 60, equal weights). The fusion
 *    over-fetches (limit * 3) to leave room for the filters below.
 * 2. Drop chunks that mostly overlap a better-ranked chunk of the same document.
 * 3. Cap the number of chunks per document, then cut to `limit`.
 * 4. Render the survivors as markdown excerpts.
 * 5. Truncate the markdown to max_context_length (ending in '...').
 *
 * @param {object} params - Raw tool arguments; validated against RagContextInput.
 * @returns {Promise<object>} Formatted success response containing the context,
 *   its length, the sources used and dedup/diversity statistics, or the value
 *   produced by handleError on failure.
 */
async function handleRagContext(params) {
    try {
        const input = validateInput(RagContextInput, params);
        const { db, vector } = requireDatabase();
        const conn = db.getConnection();
        const limit = input.limit ?? 5;
        const maxContextLength = input.max_context_length ?? 8000;
        const maxPerDoc = input.max_results_per_document ?? 3;
        // ── Step 1: Run hybrid search (BM25 + semantic + RRF) ──────────────────
        const bm25 = new BM25SearchService(conn);
        const fetchLimit = limit * 2;
        // All three BM25 searches share the same options.
        const bm25Options = {
            query: input.question,
            limit: fetchLimit,
            documentFilter: input.document_filter,
            includeHighlight: false,
        };
        const bm25ChunkResults = bm25.search({ ...bm25Options });
        const bm25VlmResults = bm25.searchVLM({ ...bm25Options });
        const bm25ExtractionResults = bm25.searchExtractions({ ...bm25Options });
        const allBm25 = [...bm25ChunkResults, ...bm25VlmResults, ...bm25ExtractionResults]
            .sort((a, b) => b.bm25_score - a.bm25_score)
            .slice(0, fetchLimit)
            .map((r, i) => ({ ...r, rank: i + 1 }));
        // Semantic search
        const embedder = getEmbeddingService();
        const queryVector = await embedder.embedSearchQuery(input.question);
        const semanticResults = vector.searchSimilar(queryVector, {
            limit: fetchLimit,
            threshold: 0.3,
            documentFilter: input.document_filter,
        });
        // Convert to ranked format and fuse with RRF (default weights)
        // Over-fetch to allow room for dedup + diversity filtering
        const bm25Ranked = toBm25Ranked(allBm25);
        const semanticRanked = toSemanticRanked(semanticResults);
        const fusion = new RRFFusion({ k: 60, bm25Weight: 1.0, semanticWeight: 1.0 });
        const fusedResults = fusion.fuse(bm25Ranked, semanticRanked, limit * 3);
        // Handle empty results
        if (fusedResults.length === 0) {
            const emptyContext = '## Relevant Document Excerpts\n\nNo relevant documents found for the given question.';
            return formatResponse(successResult({
                question: input.question,
                context: emptyContext,
                context_length: emptyContext.length,
                search_results_used: 0,
                sources: [],
                deduplication: { before: 0, after: 0, removed: 0 },
                source_diversity: { max_per_document: maxPerDoc, before: 0, after: 0 },
                next_steps: [{ tool: 'ocr_search', description: 'Try a broader search query' }],
            }));
        }
        // ── Step 1b: Deduplicate overlapping chunks (Task 3.3) ──────────────
        const deduplicated = deduplicateOverlappingResults(fusedResults);
        const dedupStats = {
            before: fusedResults.length,
            after: deduplicated.length,
            removed: fusedResults.length - deduplicated.length,
        };
        // ── Step 1c: Enforce source diversity (Task 3.4) ────────────────────
        const diversified = enforceSourceDiversity(deduplicated, maxPerDoc);
        const diversityStats = {
            max_per_document: maxPerDoc,
            before: deduplicated.length,
            after: diversified.length,
        };
        // Apply final limit after dedup + diversity
        const finalFused = diversified.slice(0, limit);
        // Enrich VLM results with image metadata
        enrichVLMResultsWithImageMetadata(conn, finalFused);
        // ── Step 2: Assemble markdown context ──────────────────────────────────
        const contextParts = ['## Relevant Document Excerpts\n'];
        const sources = [];
        finalFused.forEach((r, i) => {
            const excerpt = formatRagExcerpt(r, i + 1);
            contextParts.push(...excerpt.lines);
            sources.push(excerpt.source);
        });
        // ── Step 3: Truncate to max_context_length ─────────────────────────────
        let assembledMarkdown = contextParts.join('\n');
        if (assembledMarkdown.length > maxContextLength) {
            // Reserve three characters for the ellipsis so the cap is respected.
            assembledMarkdown = assembledMarkdown.slice(0, maxContextLength - 3) + '...';
        }
        // ── Step 4: Return structured response ─────────────────────────────────
        const ragResponse = {
            question: input.question,
            context: assembledMarkdown,
            context_length: assembledMarkdown.length,
            search_results_used: finalFused.length,
            sources,
            deduplication: dedupStats,
            source_diversity: diversityStats,
            next_steps: [
                { tool: 'ocr_search', description: 'Run a more detailed search with filters' },
                { tool: 'ocr_document_get', description: 'Get full details for a source document' },
                { tool: 'ocr_chunk_context', description: 'Expand a specific chunk with surrounding text' },
            ],
        };
        return formatResponse(successResult(ragResponse));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1952
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1953
|
+
// BENCHMARK COMPARE HANDLER
|
|
1954
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1955
|
+
/**
 * Handle ocr_benchmark_compare - Compare search results across multiple databases.
 *
 * Runs the same query (BM25 or semantic) against every database named in
 * `database_names`, records per-database scores and document IDs, and then
 * reports which document IDs were returned by more than one database.
 *
 * A failure in a single database is recorded on that database's entry
 * (`error` field) and does not abort the comparison. Only when every
 * database fails is an error response returned.
 *
 * @param {object} params - Raw tool arguments: `query`, `database_names`
 *   (at least 2), optional `search_type` ('bm25' | 'semantic', default 'bm25')
 *   and `limit` (1-50, default 10).
 * @returns {Promise<object>} Tool response built by `formatResponse` on
 *   success or `handleError` on failure.
 */
async function handleBenchmarkCompare(params) {
    try {
        const input = validateInput(z.object({
            query: z.string().min(1).max(1000),
            database_names: z.array(z.string().min(1)).min(2),
            search_type: z.enum(['bm25', 'semantic']).default('bm25'),
            limit: z.number().int().min(1).max(50).default(10),
        }), params);
        const storagePath = getDefaultStoragePath();
        const dbResults = [];
        // The query embedding depends only on the query text, so it is computed
        // at most once and reused for every database. It stays undefined after a
        // failed attempt, so the next database retries.
        let queryVector;
        for (const dbName of input.database_names) {
            let tempDb = null;
            try {
                tempDb = DatabaseService.open(dbName, storagePath);
                const conn = tempDb.getConnection();
                let scores;
                let documentIds;
                if (input.search_type === 'bm25') {
                    const bm25 = new BM25SearchService(conn);
                    const results = bm25.search({
                        query: input.query,
                        limit: input.limit,
                        includeHighlight: false,
                    });
                    scores = results.map((r) => r.bm25_score);
                    documentIds = results.map((r) => r.document_id);
                }
                else {
                    const vectorSvc = new VectorService(conn);
                    if (queryVector === undefined) {
                        const embedder = getEmbeddingService();
                        queryVector = await embedder.embedSearchQuery(input.query);
                    }
                    const results = vectorSvc.searchSimilar(queryVector, {
                        limit: input.limit,
                        threshold: 0.3,
                    });
                    scores = results.map((r) => r.similarity_score);
                    documentIds = results.map((r) => r.document_id);
                }
                const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
                dbResults.push({
                    database_name: dbName,
                    result_count: scores.length,
                    top_scores: scores.slice(0, 5),
                    avg_score: Math.round(avgScore * 1000) / 1000,
                    document_ids: documentIds,
                });
            }
            catch (error) {
                dbResults.push({
                    database_name: dbName,
                    result_count: 0,
                    top_scores: [],
                    avg_score: 0,
                    document_ids: [],
                    error: error instanceof Error ? error.message : String(error),
                });
            }
            finally {
                // A failing close must not discard the results already collected.
                try {
                    tempDb?.close();
                }
                catch (closeErr) {
                    console.error(`[search] handleBenchmarkCompare failed to close database ${dbName}: ${String(closeErr)}`);
                }
            }
        }
        // FIX-6: If every database had an error, return an error instead of success with 0 results
        const allFailed = dbResults.length > 0 && dbResults.every((r) => 'error' in r && r.error);
        if (allFailed) {
            const errors = dbResults.map((r) => `${r.database_name}: ${r.error}`).join('; ');
            return handleError(new Error(`All databases failed: ${errors}`));
        }
        // Compute overlap analysis: which document_ids appear in multiple databases.
        // A Set per document ID is required because one database can return the
        // same document several times (one hit per chunk); counting those rows
        // individually would report a document as "overlapping" with itself.
        const dbNamesByDocId = new Map(); // doc_id -> Set of db names
        for (const dbResult of dbResults) {
            for (const docId of dbResult.document_ids) {
                let dbNames = dbNamesByDocId.get(docId);
                if (!dbNames) {
                    dbNames = new Set();
                    dbNamesByDocId.set(docId, dbNames);
                }
                dbNames.add(dbResult.database_name);
            }
        }
        const overlapping = Object.fromEntries([...dbNamesByDocId]
            .filter(([, dbNames]) => dbNames.size > 1)
            .map(([docId, dbNames]) => [docId, [...dbNames]]));
        return formatResponse(successResult({
            query: input.query,
            search_type: input.search_type,
            limit: input.limit,
            databases: dbResults,
            overlap_analysis: {
                overlapping_document_ids: overlapping,
                overlap_count: Object.keys(overlapping).length,
                total_unique_documents: dbNamesByDocId.size,
            },
            next_steps: [{ tool: 'ocr_search', description: 'Search in the current database' }, { tool: 'ocr_db_select', description: 'Switch to a different database' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
2052
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2053
|
+
// SEARCH EXPORT HANDLER
|
|
2054
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2055
|
+
/**
 * Handle ocr_search_export - Export search results to CSV or JSON file.
 *
 * Steps: validate the arguments, run the query through `handleSearchUnified`,
 * unwrap the JSON tool response, then write the selected columns to
 * `output_path` (after `sanitizePath`) in the requested format.
 *
 * @param {object} params - Raw tool arguments: `query`, `output_path`, and
 *   optional `search_type`, `limit`, `format`, `include_text`.
 * @returns {Promise<object>} Tool response with the written path and result
 *   count, or the `handleError` response when any step throws.
 */
async function handleSearchExport(params) {
    try {
        const exportSchema = z.object({
            query: z.string().min(1).max(1000),
            search_type: z.enum(['bm25', 'semantic', 'hybrid']).default('hybrid'),
            limit: z.number().int().min(1).max(1000).default(100),
            format: z.enum(['csv', 'json']).default('csv'),
            output_path: z.string().min(1),
            include_text: z.boolean().default(true),
        });
        const input = validateInput(exportSchema, params);
        // The unified handler names the BM25 mode 'keyword'; other types map 1:1.
        const mode = input.search_type === 'bm25' ? 'keyword' : input.search_type;
        const searchResult = await handleSearchUnified({
            query: input.query,
            limit: input.limit,
            include_provenance: false,
            mode,
        });
        // Unwrap the ToolResponse envelope around the search payload.
        if (!searchResult.content || searchResult.content.length === 0) {
            throw new Error('Search returned empty content');
        }
        const responseContent = searchResult.content[0];
        if (responseContent.type !== 'text') {
            throw new Error('Unexpected search response format');
        }
        let parsedResponse;
        try {
            parsedResponse = JSON.parse(responseContent.text);
        }
        catch (error) {
            console.error('[search] handleSearchExport failed to parse search response as JSON:', error instanceof Error ? error.message : String(error));
            throw new Error('Failed to parse search response as JSON');
        }
        if (!parsedResponse.success) {
            throw new Error(`Search failed: ${parsedResponse.error?.message || 'Unknown error'}`);
        }
        const payload = parsedResponse.data;
        const hits = Array.isArray(payload?.results) ? payload.results : [];
        // Sanitize output path to prevent directory traversal, then make sure
        // the target directory exists before anything is written.
        const safeOutputPath = sanitizePath(input.output_path);
        fs.mkdirSync(path.dirname(safeOutputPath), { recursive: true });
        // Each search mode reports its score under a different key.
        const pickScore = (hit) => hit.bm25_score ?? hit.similarity_score ?? hit.rrf_score;
        let fileBody;
        if (input.format === 'json') {
            const rows = hits.map((hit) => ({
                document_id: hit.document_id,
                source_file: hit.source_file_name || hit.source_file_path,
                page_number: hit.page_number,
                score: pickScore(hit),
                result_type: hit.result_type,
                ...(input.include_text ? { text: hit.original_text } : {}),
            }));
            fileBody = JSON.stringify({ results: rows }, null, 2);
        }
        else {
            // CSV - RFC 4180 compliant: all fields double-quoted, internal quotes doubled
            const csvQuote = (value) => `"${value.replace(/"/g, '""')}"`;
            const columns = ['document_id', 'source_file', 'page_number', 'score', 'result_type'];
            if (input.include_text) {
                columns.push('text');
            }
            const records = hits.map((hit) => {
                const cells = [
                    String(hit.document_id ?? ''),
                    String(hit.source_file_name || hit.source_file_path || ''),
                    hit.page_number !== null && hit.page_number !== undefined ? String(hit.page_number) : '',
                    String(pickScore(hit) ?? ''),
                    String(hit.result_type || ''),
                ];
                if (input.include_text) {
                    cells.push(String(hit.original_text || ''));
                }
                return cells.map((cell) => csvQuote(cell)).join(',');
            });
            const headerLine = columns.map((column) => csvQuote(column)).join(',');
            fileBody = [headerLine, ...records].join('\n');
        }
        fs.writeFileSync(safeOutputPath, fileBody);
        return formatResponse(successResult({
            output_path: safeOutputPath,
            format: input.format,
            result_count: hits.length,
            search_type: input.search_type,
            query: input.query,
            next_steps: [{ tool: 'ocr_search', description: 'Run another search with different parameters' }, { tool: 'ocr_document_get', description: 'Get details for a document from the results' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
2156
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2157
|
+
// SAVED SEARCH HANDLERS
|
|
2158
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2159
|
+
// Input schema for ocr_search_saved. Only `action` is mandatory at the schema
// level; the per-action required fields are enforced inside handleSearchSaved.
const SearchSavedInput = z.object({
    action: z
        .enum(['list', 'get', 'execute', 'save'])
        .describe('Action: list saved searches, get by ID, execute a saved search, or save a new search'),
    // --- get / execute ---
    saved_search_id: z
        .string()
        .min(1)
        .optional()
        .describe('ID of the saved search (required for get and execute actions)'),
    // --- list filter / save field ---
    search_type: z
        .enum(['bm25', 'semantic', 'hybrid'])
        .optional()
        .describe('Filter by search type (list) or search method (save)'),
    // --- list pagination ---
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(50)
        .describe('Max results for list action'),
    offset: z
        .number()
        .int()
        .min(0)
        .default(0)
        .describe('Pagination offset for list action'),
    // --- execute ---
    override_limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .optional()
        .describe('Override the original result limit (execute action only)'),
    // --- save ---
    name: z
        .string()
        .min(1)
        .max(200)
        .optional()
        .describe('Name for saved search (required for save action)'),
    query: z
        .string()
        .min(1)
        .max(1000)
        .optional()
        .describe('Search query (required for save action)'),
    search_params: z
        .record(z.unknown())
        .optional()
        .describe('Search parameters JSON (save action)'),
    result_count: z
        .number()
        .int()
        .min(0)
        .optional()
        .describe('Number of results (save action)'),
    result_ids: z
        .array(z.string())
        .optional()
        .describe('Result IDs array (save action)'),
    notes: z
        .string()
        .optional()
        .describe('Notes about this search (save action)'),
});
|
|
2174
|
+
/**
 * Handle ocr_search_saved - Unified saved search management (MERGE-B: includes save action)
 *
 * Actions:
 * - save: Save search results for later retrieval
 * - list: List saved searches with optional type filtering
 * - get: Retrieve a saved search by ID including all parameters and result IDs
 * - execute: Re-execute a saved search with current data via handleSearchUnified
 *
 * `get` and `execute` share one lookup, so an unknown ID yields the same
 * VALIDATION_ERROR for both actions.
 *
 * @param {object} params - Raw tool arguments validated against `SearchSavedInput`.
 * @returns {Promise<object>} Tool response built by `formatResponse` on
 *   success or `handleError` on failure.
 */
async function handleSearchSaved(params) {
    try {
        const input = validateInput(SearchSavedInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        if (input.action === 'save') {
            // Validate required fields for save
            if (!input.name)
                throw new MCPError('VALIDATION_ERROR', 'name is required for save action');
            if (!input.query)
                throw new MCPError('VALIDATION_ERROR', 'query is required for save action');
            if (!input.search_type)
                throw new MCPError('VALIDATION_ERROR', 'search_type is required for save action');
            if (input.result_count === undefined)
                throw new MCPError('VALIDATION_ERROR', 'result_count is required for save action');
            const id = uuidv4();
            const now = new Date().toISOString();
            conn.prepare(`
INSERT INTO saved_searches (id, name, query, search_type, search_params, result_count, result_ids, created_at, notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
`).run(id, input.name, input.query, input.search_type, JSON.stringify(input.search_params ?? {}), input.result_count, JSON.stringify(input.result_ids ?? []), now, input.notes ?? null);
            return formatResponse(successResult({
                saved_search_id: id,
                name: input.name,
                query: input.query,
                search_type: input.search_type,
                result_count: input.result_count,
                created_at: now,
                next_steps: [{ tool: 'ocr_search_saved', description: 'List or re-execute saved searches' }],
            }));
        }
        if (input.action === 'list') {
            let sql = 'SELECT id, name, query, search_type, result_count, created_at, notes, last_executed_at, execution_count FROM saved_searches';
            const sqlParams = [];
            if (input.search_type) {
                sql += ' WHERE search_type = ?';
                sqlParams.push(input.search_type);
            }
            sql += ' ORDER BY created_at DESC LIMIT ? OFFSET ?';
            sqlParams.push(input.limit, input.offset);
            const rows = conn.prepare(sql).all(...sqlParams);
            // Total ignores LIMIT/OFFSET so callers can paginate.
            const totalRow = conn.prepare(input.search_type
                ? 'SELECT COUNT(*) as count FROM saved_searches WHERE search_type = ?'
                : 'SELECT COUNT(*) as count FROM saved_searches').get(...(input.search_type ? [input.search_type] : []));
            return formatResponse(successResult({
                action: 'list',
                saved_searches: rows,
                total: totalRow.count,
                limit: input.limit,
                offset: input.offset,
                next_steps: [{ tool: 'ocr_search', description: 'Run a new search' }, { tool: 'ocr_search_saved', description: 'Save a search (action=save) for later' }],
            }));
        }
        // Both 'get' and 'execute' require saved_search_id
        if (!input.saved_search_id) {
            throw new MCPError('VALIDATION_ERROR', 'saved_search_id is required for get and execute actions');
        }
        const row = conn.prepare('SELECT * FROM saved_searches WHERE id = ?').get(input.saved_search_id);
        if (!row) {
            throw new MCPError('VALIDATION_ERROR', `Saved search not found: ${input.saved_search_id}`);
        }
        if (input.action === 'get') {
            return formatResponse(successResult({
                action: 'get',
                id: row.id,
                name: row.name,
                query: row.query,
                search_type: row.search_type,
                search_params: JSON.parse(row.search_params),
                result_count: row.result_count,
                result_ids: JSON.parse(row.result_ids),
                created_at: row.created_at,
                notes: row.notes,
                next_steps: [{ tool: 'ocr_search', description: 'Run a new search' }, { tool: 'ocr_search_saved', description: 'Save a search (action=save) for later' }],
            }));
        }
        // action === 'execute'
        // Parse stored search parameters
        let searchParams;
        try {
            searchParams = JSON.parse(row.search_params);
        }
        catch (parseErr) {
            throw new MCPError('INTERNAL_ERROR', `Failed to parse saved search params: ${String(parseErr)}`);
        }
        // The stored value must be a JSON object because fields are assigned onto it below.
        if (searchParams === null || typeof searchParams !== 'object' || Array.isArray(searchParams)) {
            throw new MCPError('INTERNAL_ERROR', 'Failed to parse saved search params: stored value is not a JSON object');
        }
        // Override limit if requested
        if (input.override_limit !== undefined) {
            searchParams.limit = input.override_limit;
        }
        // Ensure query is set in params
        searchParams.query = row.query;
        // Dispatch through unified handler with appropriate mode
        const modeMap = { bm25: 'keyword', semantic: 'semantic', hybrid: 'hybrid' };
        const mode = modeMap[row.search_type];
        if (!mode) {
            throw new MCPError('VALIDATION_ERROR', `Unknown search type: ${row.search_type}`);
        }
        searchParams.mode = mode;
        const searchResult = await handleSearchUnified(searchParams);
        // Parse the search result to wrap with saved search metadata.
        // Guard the envelope first so a malformed response produces a clear error.
        const responseContent = searchResult?.content?.[0];
        if (!responseContent || responseContent.type !== 'text') {
            throw new MCPError('INTERNAL_ERROR', 'Saved search execution returned no text content');
        }
        let searchResultData;
        try {
            searchResultData = JSON.parse(responseContent.text);
        }
        catch (parseErr) {
            throw new MCPError('INTERNAL_ERROR', `Failed to parse search response as JSON: ${String(parseErr)}`);
        }
        // One timestamp for both the stored analytics and the response.
        const executedAt = new Date().toISOString();
        // Task 6.4: Update saved search analytics (execution tracking)
        let analyticsWarning;
        try {
            conn.prepare('UPDATE saved_searches SET last_executed_at = ?, execution_count = COALESCE(execution_count, 0) + 1 WHERE id = ?').run(executedAt, row.id);
        }
        catch (analyticsErr) {
            // Non-fatal: schema pre-v30 databases may not have these columns yet
            const msg = analyticsErr instanceof Error ? analyticsErr.message : String(analyticsErr);
            console.error('[search] Failed to update saved search analytics:', msg);
            analyticsWarning = `Analytics tracking unavailable: database schema may be pre-v30. ${msg}`;
        }
        const result = {
            action: 'execute',
            saved_search: {
                id: row.id,
                name: row.name,
                query: row.query,
                search_type: row.search_type,
                original_result_count: row.result_count,
                created_at: row.created_at,
                notes: row.notes,
            },
            re_executed_at: executedAt,
            search_results: searchResultData,
            next_steps: [{ tool: 'ocr_search', description: 'Run a new search' }, { tool: 'ocr_search_saved', description: 'Save a search (action=save) for later' }],
        };
        if (analyticsWarning) {
            result.warning = analyticsWarning;
        }
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
2323
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2324
|
+
// CROSS-DATABASE SEARCH HANDLER
|
|
2325
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2326
|
+
// Input schema for ocr_search_cross_db: one query, an optional subset of
// database names, and a cap on the number of hits taken from each database.
const CrossDbSearchInput = z.object({
    query: z
        .string()
        .min(1)
        .describe('Search query'),
    database_names: z
        .array(z.string())
        .optional()
        .describe('Database names to search (default: all databases)'),
    limit_per_db: z
        .number()
        .int()
        .min(1)
        .max(50)
        .default(10)
        .describe('Maximum results per database'),
});
|
|
2333
|
+
/**
 * Handle ocr_search_cross_db - Search across multiple databases using BM25
 *
 * Opens every selected database read-only, runs an FTS5 MATCH query against
 * `chunks_fts`, min-max normalizes the BM25 scores within each database and
 * returns all hits merged and sorted by normalized score.
 *
 * Databases that cannot be searched (requested but not found, no FTS table,
 * or any error while querying) are listed in `databases_skipped` and
 * contribute no rows to `results`.
 *
 * @param {object} params - Raw tool arguments validated against `CrossDbSearchInput`.
 * @returns {Promise<object>} Tool response built by `formatResponse` on
 *   success or `handleError` on failure.
 */
async function handleCrossDbSearch(params) {
    try {
        const input = validateInput(CrossDbSearchInput, params);
        const { listDatabases } = await import('../services/storage/database/static-operations.js');
        const Database = (await import('better-sqlite3')).default;
        // Get list of databases
        let databases = listDatabases();
        const allResults = [];
        const skippedDbs = [];
        // Filter to requested database_names if provided
        if (input.database_names && input.database_names.length > 0) {
            const requestedNames = new Set(input.database_names);
            databases = databases.filter((db) => requestedNames.has(db.name));
            // Report requested names that do not exist instead of dropping them silently.
            const foundNames = new Set(databases.map((db) => db.name));
            for (const name of requestedNames) {
                if (!foundNames.has(name)) {
                    skippedDbs.push({ name, reason: 'Database not found' });
                }
            }
        }
        let searchedCount = 0;
        for (const dbInfo of databases) {
            let conn = null;
            try {
                conn = new Database(dbInfo.path, { readonly: true });
                // Check if FTS table exists
                const ftsCheck = conn
                    .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_fts'")
                    .get();
                if (!ftsCheck) {
                    skippedDbs.push({ name: dbInfo.name, reason: 'No FTS index (chunks_fts table not found)' });
                    continue;
                }
                // Run BM25 search (sanitize query for FTS5 safety)
                const ftsQuery = sanitizeFTS5Query(input.query);
                const rows = conn
                    .prepare(`SELECT c.id, c.document_id, c.text, c.chunk_index, bm25(chunks_fts) AS bm25_score
FROM chunks_fts
JOIN chunks c ON c.rowid = chunks_fts.rowid
WHERE chunks_fts MATCH ?
ORDER BY bm25(chunks_fts)
LIMIT ?`)
                    .all(ftsQuery, input.limit_per_db);
                // Stage this database's rows locally and merge them only once the
                // whole database has been processed, so a database that fails
                // part-way never appears both in the results and in databases_skipped.
                const dbRows = [];
                // The lookup statement is prepared lazily (only when there are
                // hits) and each document is fetched at most once.
                let docInfoStmt = null;
                const docInfoById = new Map();
                for (const row of rows) {
                    if (!docInfoById.has(row.document_id)) {
                        if (!docInfoStmt) {
                            docInfoStmt = conn.prepare('SELECT file_name, file_path FROM documents WHERE id = ?');
                        }
                        docInfoById.set(row.document_id, docInfoStmt.get(row.document_id));
                    }
                    const docInfo = docInfoById.get(row.document_id);
                    dbRows.push({
                        database_name: dbInfo.name,
                        document_id: row.document_id,
                        file_name: docInfo?.file_name ?? null,
                        chunk_id: row.id,
                        chunk_index: row.chunk_index,
                        text_preview: (row.text ?? '').substring(0, 300),
                        // The result is sorted with higher = better, so the sign is dropped.
                        bm25_score: Math.abs(row.bm25_score),
                        normalized_score: 0, // Set during per-database normalization below
                    });
                }
                allResults.push(...dbRows);
                searchedCount += 1;
            }
            catch (dbError) {
                const errMsg = dbError instanceof Error ? dbError.message : String(dbError);
                console.error(`[CrossDbSearch] Failed to search database ${dbInfo.name}: ${errMsg}`);
                skippedDbs.push({ name: dbInfo.name, reason: errMsg });
            }
            finally {
                if (conn) {
                    try {
                        conn.close();
                    }
                    catch (closeErr) {
                        console.error(`[CrossDbSearch] Failed to close connection to ${dbInfo.name}: ${String(closeErr)}`);
                    }
                }
            }
        }
        // Normalize BM25 scores per-database before merging.
        // BM25 scores from different databases use different corpus statistics (IDF, avgdl)
        // so raw scores are not comparable. Min-max normalize each database's scores to [0, 1].
        const byDatabase = new Map();
        for (const r of allResults) {
            if (!byDatabase.has(r.database_name))
                byDatabase.set(r.database_name, []);
            byDatabase.get(r.database_name).push(r);
        }
        for (const dbResults of byDatabase.values()) {
            const scores = dbResults.map((r) => r.bm25_score);
            const minScore = safeMin(scores) ?? 0;
            const maxScore = safeMax(scores) ?? 0;
            const range = maxScore - minScore;
            for (const r of dbResults) {
                // With a single hit or all-equal scores there is no spread; treat as best.
                r.normalized_score = range > 0
                    ? (r.bm25_score - minScore) / range
                    : 1.0;
            }
        }
        // Sort by normalized score (higher=better)
        allResults.sort((a, b) => b.normalized_score - a.normalized_score);
        return formatResponse(successResult({
            query: input.query,
            databases_searched: searchedCount,
            total_results: allResults.length,
            results: allResults,
            score_normalization: 'per_database_min_max',
            databases_skipped: skippedDbs.length > 0 ? skippedDbs : undefined,
            next_steps: [{ tool: 'ocr_db_select', description: 'Switch to a specific database for deeper search' }, { tool: 'ocr_search', description: 'Search within the current database with full features' }],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
2441
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2442
|
+
// TOOL DEFINITIONS EXPORT
|
|
2443
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
2444
|
+
/**
 * Search tools collection for MCP server registration.
 *
 * Maps each tool name to its description, input schema shape and handler.
 */
export const searchTools = {
    // Unified keyword / semantic / hybrid search.
    ocr_search: {
        description: '[ESSENTIAL] Primary search. mode="keyword" (BM25), "semantic" (vector), or "hybrid" (default, best). Quality-weighted, query-expanded, deduplicated.',
        inputSchema: SearchUnifiedInput.shape,
        handler: handleSearchUnified,
    },
    // FTS5 index status and rebuild.
    ocr_fts_manage: {
        description: '[SETUP] FTS5 index maintenance. action="status" checks health; "rebuild" recreates index. Use when keyword search returns unexpected zero results.',
        inputSchema: {
            action: z.enum(['rebuild', 'status']).describe('Action: rebuild index or check status'),
        },
        handler: handleFTSManage,
    },
    // Export search results to a CSV or JSON file.
    ocr_search_export: {
        description: '[STATUS] Use to export search results to a CSV or JSON file on disk. Returns file path and result count.',
        inputSchema: {
            query: z.string().min(1).max(1000).describe('Search query'),
            search_type: z.enum(['bm25', 'semantic', 'hybrid']).default('hybrid').describe('Search method to use'),
            limit: z.number().int().min(1).max(1000).default(100).describe('Maximum results'),
            format: z.enum(['csv', 'json']).default('csv').describe('Export file format'),
            output_path: z.string().min(1).describe('File path to save export'),
            include_text: z.boolean().default(true).describe('Include full text in export'),
        },
        handler: handleSearchExport,
    },
    // Same query against several databases, compared side by side.
    ocr_benchmark_compare: {
        description: '[SEARCH] Use when you have the same documents in separate databases and want to compare search quality. Returns per-database results for the same query.',
        inputSchema: {
            query: z.string().min(1).max(1000).describe('Search query'),
            database_names: z.array(z.string().min(1)).min(2).describe('Database names to compare (minimum 2)'),
            search_type: z.enum(['bm25', 'semantic']).default('bm25').describe('Search method to use'),
            limit: z.number().int().min(1).max(50).default(10).describe('Maximum results per database'),
        },
        handler: handleBenchmarkCompare,
    },
    // Assembled markdown context for answering a question.
    ocr_rag_context: {
        description: '[ESSENTIAL] Use when answering a user question about document content. Returns pre-assembled, deduplicated markdown context from hybrid search. Best for RAG workflows.',
        inputSchema: {
            question: z.string().min(1).max(2000).describe('The question to build context for'),
            limit: z.number().int().min(1).max(20).default(5).describe('Maximum search results to include in context'),
            document_filter: z.array(z.string()).optional().describe('Restrict to specific documents'),
            max_context_length: z.number().int().min(500).max(50000).default(8000).describe('Maximum total context length in characters'),
            max_results_per_document: z.number().int().min(1).max(20).default(3).describe('Maximum chunks per document for source diversity (default: 3)'),
        },
        handler: handleRagContext,
    },
    // Save, list, fetch and re-run saved searches.
    ocr_search_saved: {
        description: '[SEARCH] Manage saved searches. action="save"|"list"|"get"|"execute". Save requires name, query, search_type, result_count.',
        inputSchema: SearchSavedInput.shape,
        handler: handleSearchSaved,
    },
    // BM25 search merged across databases.
    ocr_search_cross_db: {
        description: '[SEARCH] Use to search across ALL databases at once using BM25 keyword matching. Returns merged results with database source. No need to switch databases.',
        inputSchema: CrossDbSearchInput.shape,
        handler: handleCrossDbSearch,
    },
};
|
|
2528
|
+
//# sourceMappingURL=search.js.map
|