ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 Search Service using SQLite FTS5
|
|
3
|
+
*
|
|
4
|
+
* FAIL FAST: All errors throw immediately with detailed messages
|
|
5
|
+
* PROVENANCE: Every result includes provenance_id and content_hash
|
|
6
|
+
*/
|
|
7
|
+
import crypto from 'crypto';
|
|
8
|
+
import { SCHEMA_VERSION } from '../storage/migrations/schema-definitions.js';
|
|
9
|
+
import { computeQualityMultiplier } from './quality.js';
|
|
10
|
+
/**
 * Scale every result's `bm25_score` by the multiplier derived from its
 * `ocr_quality_score`, then order the array by the adjusted score
 * (largest value first) and renumber `rank` starting at 1.
 *
 * The array and its elements are modified in place; nothing is returned.
 *
 * @param {Array<object>} results - Result rows carrying `bm25_score` and `ocr_quality_score`.
 */
function applyQualityAndRerank(results) {
    // Step 1: fold the quality multiplier into each score.
    for (const hit of results) {
        hit.bm25_score = hit.bm25_score * computeQualityMultiplier(hit.ocr_quality_score);
    }
    // Step 2: in-place sort, descending by adjusted score.
    results.sort((left, right) => right.bm25_score - left.bm25_score);
    // Step 3: rank is the 1-based position after sorting.
    let position = 0;
    while (position < results.length) {
        results[position].rank = position + 1;
        position += 1;
    }
}
|
|
22
|
+
/**
 * Throw when a search query is missing or consists only of whitespace.
 *
 * @param query - Query string supplied by the caller
 * @param message - Error message to throw
 * @throws Error with `message` when the query is empty
 */
function assertNonEmptyQuery(query, message) {
    if (!query || query.trim().length === 0) {
        throw new Error(message);
    }
}
/**
 * Report whether a table with the given name is listed in sqlite_master.
 *
 * @param db - Database handle exposing `prepare(sql).get(...params)`
 * @param tableName - Table name to look up
 * @returns true when a matching row exists
 */
function ftsTableExists(db, tableName) {
    const row = db
        .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name=?")
        .get(tableName);
    return Boolean(row);
}
/**
 * Build the expression bound to `MATCH ?` for a search request.
 *
 * - phraseSearch: the whole query is wrapped in double quotes, with embedded
 *   double quotes doubled.
 * - preSanitized: the query is used verbatim, unless it still contains one of
 *   `" ' ( )` (M-7 defense-in-depth), in which case a warning is logged and the
 *   query is run through sanitizeFTS5Query.
 * - otherwise: the query is run through sanitizeFTS5Query.
 *
 * @param query - Non-empty query string
 * @param phraseSearch - Treat the query as one exact phrase
 * @param preSanitized - Caller asserts the query is already safe for FTS5
 * @returns String to bind as the MATCH argument
 * @throws Error propagated from sanitizeFTS5Query when no tokens survive
 */
function buildMatchExpression(query, phraseSearch, preSanitized) {
    if (phraseSearch) {
        return `"${query.replace(/"/g, '""')}"`;
    }
    if (preSanitized) {
        if (!/["'()]/.test(query)) {
            return query;
        }
        console.error(`[WARN] preSanitized query contains FTS5 metacharacters, falling back to sanitization: "${query}"`);
    }
    return sanitizeFTS5Query(query);
}
/**
 * BM25 full-text search over the chunks, VLM-description, extraction and
 * document-metadata FTS5 indexes, plus index rebuild and status reporting.
 *
 * Construction fails immediately when the `chunks_fts` table is absent.
 */
export class BM25SearchService {
    db;
    /**
     * @param db - Database handle (better-sqlite3 style: prepare/exec/transaction)
     * @throws Error when the chunks_fts table does not exist
     */
    constructor(db) {
        this.db = db;
        this.verifyFTSTableExists();
    }
    /**
     * Fail fast when the chunks FTS table is missing.
     *
     * @throws Error describing the required migration
     */
    verifyFTSTableExists() {
        if (!ftsTableExists(this.db, 'chunks_fts')) {
            throw new Error('FTS5 table "chunks_fts" not found. Database must be at schema version 4. ' +
                'Re-select the database to trigger migration.');
        }
    }
    /**
     * Search chunk text via chunks_fts JOIN chunks JOIN documents.
     *
     * Rows are fetched ordered by bm25(chunks_fts) and capped at `limit`; the
     * raw score is converted to its absolute value, then the hits are passed
     * through applyQualityAndRerank before being returned.
     *
     * @param options - { query, limit = 10, phraseSearch = false, documentFilter,
     *   includeHighlight = true, chunkFilter, preSanitized = false }.
     *   `chunkFilter.conditions` are appended to the WHERE clause verbatim and
     *   `chunkFilter.params` are bound after the document filter values.
     * @returns Array of hits with result_type 'chunk'
     * @throws Error when the query is empty or sanitization leaves no tokens
     */
    search(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, chunkFilter, preSanitized = false, } = options;
        assertNonEmptyQuery(query, 'BM25 search query cannot be empty');
        const ftsQuery = buildMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        c.id AS chunk_id,
        (SELECT e.id FROM embeddings e WHERE e.chunk_id = c.id ORDER BY e.created_at DESC LIMIT 1) AS embedding_id,
        c.document_id,
        c.text AS original_text,
        bm25(chunks_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        c.page_number,
        c.character_start,
        c.character_end,
        c.chunk_index,
        c.provenance_id,
        c.text_hash AS content_hash,
        c.heading_context,
        c.section_path,
        c.content_types,
        c.is_atomic,
        c.page_range,
        c.heading_level,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = c.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score,
        c.overlap_previous,
        c.overlap_next,
        c.chunking_strategy,
        c.embedding_status,
        d.page_count AS doc_page_count,
        (SELECT o.datalab_mode FROM ocr_results o WHERE o.document_id = c.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS datalab_mode,
        (SELECT COUNT(*) FROM chunks c2 WHERE c2.document_id = c.document_id) AS total_chunks
        ${includeHighlight ? ", snippet(chunks_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM chunks_fts
      JOIN chunks c ON chunks_fts.rowid = c.rowid
      JOIN documents d ON c.document_id = d.id
      WHERE chunks_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND c.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        if (chunkFilter && chunkFilter.conditions.length > 0) {
            for (const condition of chunkFilter.conditions) {
                sql += ` AND ${condition}`;
            }
            params.push(...chunkFilter.params);
        }
        sql += ` ORDER BY bm25(chunks_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        // TY-09: rows are untyped records; the SELECT list above defines the columns read here.
        const results = rows.map((row, index) => ({
            chunk_id: row.chunk_id,
            image_id: null,
            embedding_id: row.embedding_id ?? null,
            extraction_id: null,
            document_id: row.document_id,
            original_text: row.original_text,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'chunk',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: row.page_number,
            character_start: row.character_start,
            character_end: row.character_end,
            chunk_index: row.chunk_index,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            heading_context: row.heading_context ?? null,
            section_path: row.section_path ?? null,
            content_types: row.content_types ?? null,
            is_atomic: !!row.is_atomic,
            page_range: row.page_range ?? null,
            heading_level: row.heading_level ?? null,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
            overlap_previous: row.overlap_previous ?? 0,
            overlap_next: row.overlap_next ?? 0,
            chunking_strategy: row.chunking_strategy ?? null,
            embedding_status: row.embedding_status ?? 'pending',
            doc_page_count: row.doc_page_count ?? null,
            datalab_mode: row.datalab_mode ?? null,
            total_chunks: row.total_chunks ?? 0,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Search VLM description embeddings using FTS5.
     * Queries vlm_fts JOIN embeddings JOIN documents.
     *
     * NOTE: VLM results only support page_range_filter from chunk filters
     * (VLM embeddings don't have heading_context, section_path, etc.)
     *
     * @param options - { query, limit = 10, phraseSearch = false, documentFilter,
     *   includeHighlight = true, pageRangeFilter, preSanitized = false }
     * @returns Array of hits with result_type 'vlm'; an empty array when the
     *   vlm_fts table does not exist (v6+ only)
     * @throws Error when the query is empty or sanitization leaves no tokens
     */
    searchVLM(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, pageRangeFilter, preSanitized = false, } = options;
        assertNonEmptyQuery(query, 'BM25 search query cannot be empty');
        if (!ftsTableExists(this.db, 'vlm_fts')) {
            return [];
        }
        const ftsQuery = buildMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        e.id AS embedding_id,
        e.image_id,
        e.document_id,
        e.original_text,
        bm25(vlm_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        e.page_number,
        e.character_start,
        e.character_end,
        e.chunk_index,
        e.provenance_id,
        e.content_hash,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = e.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score
        ${includeHighlight ? ", snippet(vlm_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM vlm_fts
      JOIN embeddings e ON vlm_fts.rowid = e.rowid
      JOIN documents d ON e.document_id = d.id
      WHERE vlm_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND e.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        // VLM only supports page_range_filter (no heading/section/content_type)
        if (pageRangeFilter) {
            if (pageRangeFilter.min_page !== undefined) {
                sql += ' AND e.page_number >= ?';
                params.push(pageRangeFilter.min_page);
            }
            if (pageRangeFilter.max_page !== undefined) {
                sql += ' AND e.page_number <= ?';
                params.push(pageRangeFilter.max_page);
            }
        }
        sql += ` ORDER BY bm25(vlm_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        const results = rows.map((row, index) => ({
            chunk_id: null,
            image_id: row.image_id,
            embedding_id: row.embedding_id,
            extraction_id: null,
            document_id: row.document_id,
            original_text: row.original_text,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'vlm',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: row.page_number,
            character_start: row.character_start,
            character_end: row.character_end,
            chunk_index: row.chunk_index,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Search extraction content using FTS5.
     * Queries extractions_fts JOIN extractions JOIN documents.
     *
     * NOTE: Extractions don't have page numbers or chunk metadata,
     * so chunkFilter and pageRangeFilter are not applied here; page_number is
     * returned as null and character_start/character_end/chunk_index as 0.
     *
     * @param options - { query, limit = 10, phraseSearch = false, documentFilter,
     *   includeHighlight = true, preSanitized = false }
     * @returns Array of hits with result_type 'extraction'; an empty array when
     *   the extractions_fts table does not exist (v9+ only)
     * @throws Error when the query is empty or sanitization leaves no tokens
     */
    searchExtractions(options) {
        const { query, limit = 10, phraseSearch = false, documentFilter, includeHighlight = true, preSanitized = false, } = options;
        assertNonEmptyQuery(query, 'BM25 search query cannot be empty');
        if (!ftsTableExists(this.db, 'extractions_fts')) {
            return [];
        }
        const ftsQuery = buildMatchExpression(query, phraseSearch, preSanitized);
        let sql = `
      SELECT
        ex.id AS extraction_id,
        ex.document_id,
        ex.extraction_json AS original_text,
        bm25(extractions_fts) AS bm25_score,
        d.file_path AS source_file_path,
        d.file_name AS source_file_name,
        d.file_hash AS source_file_hash,
        ex.provenance_id,
        ex.content_hash,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        (SELECT o.parse_quality_score FROM ocr_results o WHERE o.document_id = ex.document_id ORDER BY o.processing_completed_at DESC LIMIT 1) AS ocr_quality_score,
        (SELECT e.id FROM embeddings e WHERE e.extraction_id = ex.id ORDER BY e.created_at DESC LIMIT 1) AS embedding_id
        ${includeHighlight ? ", snippet(extractions_fts, 0, '<mark>', '</mark>', '...', 32) AS highlight" : ''}
      FROM extractions_fts
      JOIN extractions ex ON extractions_fts.rowid = ex.rowid
      JOIN documents d ON ex.document_id = d.id
      WHERE extractions_fts MATCH ?
    `;
        const params = [ftsQuery];
        if (documentFilter && documentFilter.length > 0) {
            sql += ` AND ex.document_id IN (${documentFilter.map(() => '?').join(',')})`;
            params.push(...documentFilter);
        }
        sql += ` ORDER BY bm25(extractions_fts) LIMIT ?`;
        params.push(limit);
        const rows = this.db.prepare(sql).all(...params);
        const results = rows.map((row, index) => ({
            chunk_id: null,
            image_id: null,
            embedding_id: row.embedding_id ?? null,
            extraction_id: row.extraction_id,
            document_id: row.document_id,
            original_text: row.original_text,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'extraction',
            source_file_path: row.source_file_path,
            source_file_name: row.source_file_name,
            source_file_hash: row.source_file_hash,
            page_number: null,
            character_start: 0,
            character_end: 0,
            chunk_index: 0,
            provenance_id: row.provenance_id,
            content_hash: row.content_hash,
            highlight: row.highlight,
            ocr_quality_score: row.ocr_quality_score ?? null,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
        }));
        applyQualityAndRerank(results);
        return results;
    }
    /**
     * Rebuild the chunks FTS index, record the rebuild in fts_index_metadata
     * (row id 1), then rebuild the VLM and extraction indexes as well.
     *
     * @returns { chunks_indexed, vlm_indexed, extractions_indexed, duration_ms, content_hash }
     */
    rebuildIndex() {
        const start = Date.now();
        this.db.exec("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')");
        const count = this.db.prepare('SELECT COUNT(*) as cnt FROM chunks').get();
        const contentHash = this.computeContentHash();
        const now = new Date().toISOString();
        // NOTE(review): on conflict only last_rebuild_at, chunks_indexed and content_hash
        // are refreshed; tokenizer and schema_version keep their first-insert values --
        // confirm that is intended.
        this.db
            .prepare(`
      INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
      VALUES (1, ?, ?, 'porter unicode61', ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        last_rebuild_at = excluded.last_rebuild_at,
        chunks_indexed = excluded.chunks_indexed,
        content_hash = excluded.content_hash
    `)
            .run(now, count.cnt, SCHEMA_VERSION, contentHash);
        // Also rebuild VLM FTS if table exists
        const vlmResult = this.rebuildVLMIndex();
        // Also rebuild extractions FTS if table exists
        const extractionResult = this.rebuildExtractionIndex();
        const duration = Date.now() - start;
        return {
            chunks_indexed: count.cnt,
            vlm_indexed: vlmResult.vlm_indexed,
            extractions_indexed: extractionResult.extractions_indexed,
            duration_ms: duration,
            content_hash: contentHash,
        };
    }
    /**
     * Rebuild VLM FTS index from embeddings where image_id IS NOT NULL.
     *
     * @returns { vlm_indexed, duration_ms }; both 0 when vlm_fts does not exist
     */
    rebuildVLMIndex() {
        if (!ftsTableExists(this.db, 'vlm_fts')) {
            return { vlm_indexed: 0, duration_ms: 0 };
        }
        const start = Date.now();
        // L-15: Wrap delete-all + insert + metadata update in a transaction so a crash
        // between delete-all and insert cannot leave an empty VLM FTS index.
        // H-4 fix: FTS5 'rebuild' reads ALL rows from the content table (embeddings),
        // including chunk embeddings (image_id IS NULL). This creates ghost VLM results.
        // Instead: clear the index, then manually re-insert only VLM embeddings.
        const rebuildTransaction = this.db.transaction(() => {
            this.db.exec("INSERT INTO vlm_fts(vlm_fts) VALUES('delete-all')");
            this.db.exec(`
        INSERT INTO vlm_fts(rowid, original_text)
        SELECT rowid, original_text FROM embeddings WHERE image_id IS NOT NULL
      `);
            const count = this.db
                .prepare('SELECT COUNT(*) as cnt FROM embeddings WHERE image_id IS NOT NULL')
                .get();
            const now = new Date().toISOString();
            this.db
                .prepare(`
        INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
        VALUES (2, ?, ?, 'porter unicode61', ?, NULL)
        ON CONFLICT(id) DO UPDATE SET
          last_rebuild_at = excluded.last_rebuild_at,
          chunks_indexed = excluded.chunks_indexed
      `)
                .run(now, count.cnt, SCHEMA_VERSION);
            return count.cnt;
        });
        const vlmCount = rebuildTransaction();
        return {
            vlm_indexed: vlmCount,
            duration_ms: Date.now() - start,
        };
    }
    /**
     * Rebuild extractions FTS index and record it in fts_index_metadata (row id 3).
     *
     * @returns { extractions_indexed, duration_ms }; both 0 when extractions_fts does not exist
     */
    rebuildExtractionIndex() {
        if (!ftsTableExists(this.db, 'extractions_fts')) {
            return { extractions_indexed: 0, duration_ms: 0 };
        }
        const start = Date.now();
        this.db.exec("INSERT INTO extractions_fts(extractions_fts) VALUES('rebuild')");
        const count = this.db.prepare('SELECT COUNT(*) as cnt FROM extractions').get();
        const now = new Date().toISOString();
        this.db
            .prepare(`
      INSERT INTO fts_index_metadata (id, last_rebuild_at, chunks_indexed, tokenizer, schema_version, content_hash)
      VALUES (3, ?, ?, 'porter unicode61', ?, NULL)
      ON CONFLICT(id) DO UPDATE SET
        last_rebuild_at = excluded.last_rebuild_at,
        chunks_indexed = excluded.chunks_indexed
    `)
            .run(now, count.cnt, SCHEMA_VERSION);
        return {
            extractions_indexed: count.cnt,
            duration_ms: Date.now() - start,
        };
    }
    /**
     * Search document metadata (title, author, subject) using FTS5.
     * Queries documents_fts table (v30+).
     *
     * Returns document IDs and metadata fields matching the query.
     * Used to find documents by metadata rather than content. Unlike the
     * content searches, hits are returned in bm25 order without quality re-ranking.
     *
     * @param options - { query, limit = 10, phraseSearch = false }
     * @returns Array of hits with result_type 'document_metadata'; an empty
     *   array when the documents_fts table does not exist
     * @throws Error when the query is empty or sanitization leaves no tokens
     */
    searchDocumentMetadata(options) {
        const { query, limit = 10, phraseSearch = false } = options;
        assertNonEmptyQuery(query, 'Document metadata search query cannot be empty');
        if (!ftsTableExists(this.db, 'documents_fts')) {
            return [];
        }
        const ftsQuery = buildMatchExpression(query, phraseSearch, false);
        const sql = `
      SELECT
        d.id AS document_id,
        d.file_name,
        d.doc_title,
        d.doc_author,
        d.doc_subject,
        bm25(documents_fts) AS bm25_score
      FROM documents_fts
      JOIN documents d ON documents_fts.rowid = d.rowid
      WHERE documents_fts MATCH ?
      ORDER BY bm25(documents_fts)
      LIMIT ?
    `;
        const rows = this.db.prepare(sql).all(ftsQuery, limit);
        return rows.map((row, index) => ({
            document_id: row.document_id,
            file_name: row.file_name,
            doc_title: row.doc_title ?? null,
            doc_author: row.doc_author ?? null,
            doc_subject: row.doc_subject ?? null,
            bm25_score: Math.abs(row.bm25_score),
            rank: index + 1,
            result_type: 'document_metadata',
        }));
    }
    /**
     * Check whether all expected FTS triggers exist for a given set of trigger names.
     * If all triggers are present, the FTS index is kept in sync atomically and cannot be stale.
     * If any trigger is missing, the index IS stale (triggers are the sync mechanism).
     *
     * Names are de-duplicated before counting, so a repeated name cannot make
     * the count comparison fail when every trigger is actually present.
     *
     * @param triggerNames - Trigger names to look for in sqlite_master
     * @returns true when every distinct name exists (or the list is empty)
     */
    checkTriggersExist(triggerNames) {
        const uniqueNames = [...new Set(triggerNames)];
        if (uniqueNames.length === 0) {
            return true;
        }
        const placeholders = uniqueNames.map(() => '?').join(',');
        const row = this.db
            .prepare(`SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='trigger' AND name IN (${placeholders})`)
            .get(...uniqueNames);
        return row.cnt === uniqueNames.length;
    }
    /**
     * Report the state of the chunk, VLM and extraction FTS indexes.
     *
     * The returned object spreads the fts_index_metadata row with id 1 and adds
     * current row counts, the indexed counts recorded for ids 2 and 3, and
     * staleness flags derived from trigger existence.
     *
     * @returns Status object as described above
     * @throws Error when the fts_index_metadata row with id 1 is missing
     */
    getStatus() {
        const meta = this.db.prepare('SELECT * FROM fts_index_metadata WHERE id = 1').get();
        if (!meta) {
            throw new Error('FTS index metadata not found. Database migration to v4 may not have completed.');
        }
        const chunkCount = this.db.prepare('SELECT COUNT(*) as cnt FROM chunks').get().cnt;
        // L-7 fix: Stale detection via trigger existence, not count comparison.
        // FTS is maintained by triggers that fire atomically on INSERT/DELETE/UPDATE.
        // If all triggers exist, the index is in sync by definition.
        // If any trigger is missing, the index IS stale (sync mechanism is broken).
        const chunksTriggersOk = this.checkTriggersExist([
            'chunks_fts_ai', 'chunks_fts_ad', 'chunks_fts_au',
        ]);
        // Get VLM FTS metadata (id=2) if it exists
        const vlmMeta = this.db.prepare('SELECT * FROM fts_index_metadata WHERE id = 2').get();
        const vlmCount = this.db
            .prepare('SELECT COUNT(*) as cnt FROM embeddings WHERE image_id IS NOT NULL')
            .get().cnt;
        const vlmIndexed = vlmMeta?.chunks_indexed ?? 0;
        const vlmTriggersOk = this.checkTriggersExist([
            'vlm_fts_ai', 'vlm_fts_ad', 'vlm_fts_au',
        ]);
        // Get extraction FTS metadata (id=3) if it exists
        const extractionMeta = this.db
            .prepare('SELECT * FROM fts_index_metadata WHERE id = 3')
            .get();
        // Best effort: a failure to count extractions is logged and reported as 0.
        const extractionCount = (() => {
            try {
                return this.db.prepare('SELECT COUNT(*) as cnt FROM extractions').get()
                    .cnt;
            }
            catch (error) {
                console.error(`[BM25] Failed to count extractions: ${String(error)}`);
                return 0;
            }
        })();
        const extractionsIndexed = extractionMeta?.chunks_indexed ?? 0;
        const extractionTriggersOk = this.checkTriggersExist([
            'extractions_fts_ai', 'extractions_fts_ad', 'extractions_fts_au',
        ]);
        return {
            ...meta,
            current_chunk_count: chunkCount,
            index_stale: !chunksTriggersOk,
            vlm_indexed: vlmIndexed,
            current_vlm_count: vlmCount,
            vlm_index_stale: !vlmTriggersOk,
            vlm_last_rebuild_at: vlmMeta?.last_rebuild_at ?? null,
            extractions_indexed: extractionsIndexed,
            current_extraction_count: extractionCount,
            extraction_index_stale: !extractionTriggersOk,
            extraction_last_rebuild_at: extractionMeta?.last_rebuild_at ?? null,
        };
    }
    /**
     * Compute the integrity hash of the chunks table via computeFTSContentHash.
     *
     * @returns Hash string produced by computeFTSContentHash(this.db)
     */
    computeContentHash() {
        return computeFTSContentHash(this.db);
    }
}
|
|
571
|
+
/**
 * Sanitize a user-provided query for safe use in FTS5 MATCH expressions.
 *
 * - Preserves FTS5 boolean operators (AND, OR, NOT); whitespace-delimited
 *   words equal to an operator in any letter case are upper-cased and treated
 *   as that operator
 * - Treats hyphens as word separators (matching unicode61 tokenizer)
 * - Strips every ASCII character that is not legal in an FTS5 bareword, i.e.
 *   anything other than letters, digits, underscore and the substitute
 *   character (0x1A). This covers ' " ( ) * : ^ ~ + etc. and also `=` and
 *   control characters, which would otherwise reach MATCH and raise a syntax
 *   error. Non-ASCII characters are kept.
 * - Inserts implicit AND between consecutive non-operator tokens
 * - Strips leading/trailing/consecutive operators (the first operator of a
 *   run is the one kept)
 *
 * This is the SINGLE authoritative FTS5 sanitizer for the entire codebase.
 *
 * @param query - Raw user query string
 * @returns Sanitized FTS5 query string
 * @throws Error if query contains no valid tokens after sanitization
 */
export function sanitizeFTS5Query(query) {
    const FTS5_OPERATORS = new Set(['AND', 'OR', 'NOT']);
    // Complement of the FTS5 bareword alphabet, restricted to ASCII
    // (code units >= 0x80 are always allowed in a bareword).
    const NON_BAREWORD_ASCII = /[^A-Za-z0-9_\x1a\u0080-\uffff]/g;
    const tokens = [];
    for (const raw of query.trim().split(/\s+/)) {
        if (raw.length === 0) {
            continue;
        }
        const upper = raw.toUpperCase();
        if (FTS5_OPERATORS.has(upper)) {
            tokens.push(upper);
            continue;
        }
        // L-5: Treat hyphens as word separators (matching FTS5 unicode61 tokenizer)
        for (const piece of raw.split('-')) {
            const bare = piece.replace(NON_BAREWORD_ASCII, '');
            if (bare.length > 0) {
                tokens.push(bare);
            }
        }
    }
    // Drop operators that have no left operand (leading) or that directly
    // follow another operator, then drop operators with no right operand.
    const cleaned = [];
    for (const token of tokens) {
        const previousIsOperator = cleaned.length === 0 || FTS5_OPERATORS.has(cleaned[cleaned.length - 1]);
        if (FTS5_OPERATORS.has(token) && previousIsOperator) {
            continue;
        }
        cleaned.push(token);
    }
    while (cleaned.length > 0 && FTS5_OPERATORS.has(cleaned[cleaned.length - 1])) {
        cleaned.pop();
    }
    if (cleaned.length === 0) {
        throw new Error('Query contains no valid search tokens after sanitization');
    }
    // Insert implicit AND between consecutive non-operator tokens
    const parts = [];
    cleaned.forEach((token, i) => {
        parts.push(token);
        const next = cleaned[i + 1];
        if (next !== undefined && !FTS5_OPERATORS.has(token) && !FTS5_OPERATORS.has(next)) {
            parts.push('AND');
        }
    });
    return parts.join(' ');
}
|
|
639
|
+
/**
 * Produce the integrity fingerprint of the chunks table used for FTS index verification.
 *
 * Streams `id` and `text_hash` for every chunk (ordered by id) through a single
 * SHA-256 digest, feeding it `id:text_hash` entries separated by `|`.
 * L-10 fix: rows are consumed one at a time via iterate() rather than being
 * loaded into memory together.
 * Used by both BM25SearchService and the v3->v4 migration.
 *
 * @param db - Database handle exposing `prepare(sql).iterate()`
 * @returns The hex digest prefixed with `sha256:`
 */
export function computeFTSContentHash(db) {
    const digest = crypto.createHash('sha256');
    const statement = db.prepare('SELECT id, text_hash FROM chunks ORDER BY id');
    let separator = '';
    for (const { id, text_hash: textHash } of statement.iterate()) {
        digest.update(`${separator}${id}:${textHash}`);
        separator = '|';
    }
    return `sha256:${digest.digest('hex')}`;
}
|
|
656
|
+
//# sourceMappingURL=bm25.js.map
|