ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clustering Service - Orchestrates document clustering pipeline
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* 1. Fetch document-level embeddings (average-pool chunk embeddings, L2-normalize)
|
|
6
|
+
* 2. Call Python clustering worker (HDBSCAN / agglomerative / kmeans)
|
|
7
|
+
* 3. Store cluster + document_cluster records with provenance
|
|
8
|
+
*
|
|
9
|
+
* CRITICAL: NEVER use console.log() - stdout is JSON-RPC protocol.
|
|
10
|
+
* Use console.error() for all logging.
|
|
11
|
+
*
|
|
12
|
+
* @module services/clustering/clustering-service
|
|
13
|
+
*/
|
|
14
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
15
|
+
import { PythonShell } from 'python-shell';
|
|
16
|
+
import path from 'path';
|
|
17
|
+
import { fileURLToPath } from 'url';
|
|
18
|
+
import { getProvenanceTracker } from '../provenance/index.js';
|
|
19
|
+
import { ProvenanceType } from '../../models/provenance.js';
|
|
20
|
+
import { insertCluster, insertDocumentCluster } from '../storage/database/cluster-operations.js';
|
|
21
|
+
import { computeHash } from '../../utils/hash.js';
|
|
22
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
23
|
+
/**
 * Error type thrown by the clustering service.
 *
 * In addition to the message it carries a machine-readable `code` and an
 * optional `details` payload supplied by the throw site.
 */
class ClusteringError extends Error {
    /**
     * @param message - Human-readable description of the failure
     * @param code - Machine-readable error code
     * @param details - Optional structured context attached by the caller
     */
    constructor(message, code, details) {
        super(message);
        // Assigned in this order so own enumerable keys stay code, details, name.
        Object.assign(this, { code, details, name: 'ClusteringError' });
    }
}
|
|
33
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
34
|
+
// SERVICE
|
|
35
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
36
|
+
/** Worker timeout in milliseconds: 5 minutes. */
const WORKER_TIMEOUT_MS = 5 * 60 * 1000;
/** Maximum number of stderr characters accumulated from the worker: 10KB. */
const MAX_STDERR_LENGTH = 10 * 1024;
|
|
40
|
+
/**
 * Build one embedding per document by pooling its chunk embeddings.
 *
 * Reads every chunk-based vector (rows where `chunk_id IS NOT NULL`, which
 * excludes VLM and extraction embeddings) joined from `vec_embeddings` and
 * `embeddings`, groups the vectors by document, and hands each group to
 * `averageVectors()` for mean-pooling and L2 normalization.
 *
 * sqlite-vec has no native vector averaging, so pooling happens in JS.
 *
 * @param conn - Raw better-sqlite3 connection (for direct vec_embeddings queries)
 * @param documentIds - Optional filter; if missing or empty, all documents with
 *   chunk embeddings are included
 * @returns Array of `{ document_id, embedding, chunk_count }`, one per document
 */
export function computeDocumentEmbeddings(conn, documentIds) {
    // A missing or empty filter means "every document".
    const ids = documentIds && documentIds.length > 0 ? documentIds : [];
    const placeholders = ids.map(() => '?').join(', ');
    const filterClause = ids.length > 0 ? ` AND e.document_id IN (${placeholders})` : '';
    const statement = conn.prepare(`
    SELECT e.document_id, v.vector
    FROM vec_embeddings v
    JOIN embeddings e ON e.id = v.embedding_id
    WHERE e.chunk_id IS NOT NULL${filterClause}
    ORDER BY e.document_id, e.chunk_index
  `);
    const rows = statement.all(...ids);
    // Bucket the raw vectors by owning document, preserving row order.
    const vectorsByDoc = new Map();
    for (const { document_id: docId, vector } of rows) {
        let bucket = vectorsByDoc.get(docId);
        if (!bucket) {
            bucket = [];
            vectorsByDoc.set(docId, bucket);
        }
        bucket.push(vector);
    }
    // Average-pool + L2-normalize each bucket.
    return Array.from(vectorsByDoc, ([docId, vectors]) => ({
        document_id: docId,
        embedding: averageVectors(vectors),
        chunk_count: vectors.length,
    }));
}
|
|
90
|
+
/**
 * Average-pool vectors and L2-normalize the result.
 *
 * Each input is read as `dim` little-endian-native float32 values starting at
 * the beginning of the buffer; any trailing bytes are ignored.
 *
 * @param vectors - Array of float32 vectors as Buffers (from sqlite-vec)
 * @param dim - Number of float32 components per vector (default 768)
 * @returns L2-normalized Float32Array of length `dim`; all zeros when
 *   `vectors` is empty or the mean has zero norm
 * @throws RangeError if a buffer holds fewer than `dim` float32 values
 */
function averageVectors(vectors, dim = 768) {
    if (vectors.length === 0) {
        return new Float32Array(dim);
    }
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    const requiredBytes = dim * bytesPerFloat;
    const sum = new Float64Array(dim); // Use float64 for accumulation precision
    for (const buf of vectors) {
        if (buf.byteLength < requiredBytes) {
            throw new RangeError(`Embedding vector has ${buf.byteLength} bytes, expected at least ${requiredBytes}`);
        }
        // A Float32Array view requires a 4-byte-aligned offset. Buffers that are
        // slices of a larger allocation may start at any byte, so copy those out
        // instead of letting the typed-array constructor throw.
        const isAligned = buf.byteOffset % bytesPerFloat === 0;
        const f32 = isAligned
            ? new Float32Array(buf.buffer, buf.byteOffset, dim)
            : new Float32Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + requiredBytes));
        for (let i = 0; i < dim; i++) {
            sum[i] += f32[i];
        }
    }
    // Mean
    const n = vectors.length;
    const result = new Float32Array(dim);
    for (let i = 0; i < dim; i++) {
        result[i] = sum[i] / n;
    }
    // L2 normalize
    let norm = 0;
    for (let i = 0; i < dim; i++) {
        norm += result[i] * result[i];
    }
    norm = Math.sqrt(norm);
    // A zero mean cannot be normalized; leave it as the zero vector.
    if (norm > 0) {
        for (let i = 0; i < dim; i++) {
            result[i] /= norm;
        }
    }
    return result;
}
|
|
127
|
+
/**
 * Run the Python clustering worker via python-shell.
 *
 * Sends the request as JSON on stdin and resolves with the last stdout line
 * that parses as JSON. Rejects with a ClusteringError when:
 *  - python-shell reports an error, e.g. the process cannot be spawned (WORKER_FAILED)
 *  - the worker is still running after WORKER_TIMEOUT_MS (WORKER_TIMEOUT)
 *  - the worker ends without producing output (WORKER_FAILED)
 *  - the output is not JSON or lacks required fields (WORKER_PARSE_ERROR)
 *
 * @param embeddings - 2D array of embeddings [n_docs][768]
 * @param documentIds - Document IDs matching embedding order
 * @param config - Clustering algorithm configuration
 * @param distanceMatrix - Optional precomputed distance matrix [n_docs][n_docs]
 * @returns WorkerResult from Python
 */
async function runClusteringWorker(embeddings, documentIds, config, distanceMatrix) {
    const workerPath = path.resolve(__dirname, '../../../python/clustering_worker.py');
    const workerInput = {
        embeddings,
        document_ids: documentIds,
        algorithm: config.algorithm,
        n_clusters: config.n_clusters,
        min_cluster_size: config.min_cluster_size,
        distance_threshold: config.distance_threshold,
        linkage: config.linkage,
    };
    if (distanceMatrix) {
        workerInput.distance_matrix = distanceMatrix;
    }
    const input = JSON.stringify(workerInput);
    return new Promise((resolve, reject) => {
        let settled = false;
        // Set once the timeout fires so a later exit is reported as a timeout.
        let timedOut = false;
        let stderr = '';
        let sigkillTimer = null;
        const options = {
            mode: 'text',
            pythonOptions: ['-u'],
            args: [],
        };
        const shell = new PythonShell(workerPath, options);
        const stderrExcerpt = () => stderr.substring(0, 1000);
        const describe = (error) => (error instanceof Error ? error.message : String(error));
        const cleanup = () => {
            if (sigkillTimer) {
                clearTimeout(sigkillTimer);
                sigkillTimer = null;
            }
        };
        const timer = setTimeout(() => {
            if (settled)
                return;
            timedOut = true;
            try {
                shell.kill();
            }
            catch (error) {
                // Best effort: the SIGKILL escalation below still runs.
                console.error('[ClusteringService] Failed to kill shell on timeout:', describe(error));
            }
            // M-6: SIGKILL escalation if SIGTERM doesn't exit within 5s
            sigkillTimer = setTimeout(() => {
                if (settled)
                    return;
                console.error(`[ClusteringService] Process did not exit after SIGTERM, sending SIGKILL (pid: ${shell.childProcess?.pid})`);
                try {
                    shell.childProcess?.kill('SIGKILL');
                }
                catch (error) {
                    console.error('[ClusteringService] Failed to SIGKILL process (may already be gone):', describe(error));
                }
                settled = true;
                reject(new ClusteringError(`Clustering worker timeout after ${WORKER_TIMEOUT_MS}ms (SIGKILL after 5s grace)`, 'WORKER_TIMEOUT', { stderr: stderrExcerpt() }));
            }, 5000);
        }, WORKER_TIMEOUT_MS);
        const outputChunks = [];
        shell.on('message', (msg) => {
            outputChunks.push(msg);
        });
        shell.on('stderr', (err) => {
            if (stderr.length < MAX_STDERR_LENGTH) {
                // Truncate so one very long line cannot blow past the cap.
                stderr = (stderr + err + '\n').slice(0, MAX_STDERR_LENGTH);
            }
        });
        // Without a listener, an 'error' event (e.g. the interpreter could not be
        // spawned) would be thrown by the emitter as an uncaught exception.
        shell.on('error', (error) => {
            console.error('[ClusterWorker] Error:', describe(error));
            // After a timeout the kill/SIGKILL path owns settlement.
            if (settled || timedOut)
                return;
            settled = true;
            clearTimeout(timer);
            reject(new ClusteringError(`Clustering worker failed: ${describe(error)}`, 'WORKER_FAILED', {
                stderr: stderrExcerpt(),
            }));
        });
        const handleEnd = (err) => {
            clearTimeout(timer);
            cleanup();
            if (settled)
                return;
            settled = true;
            if (err) {
                console.error('[ClusterWorker] Error:', err.message);
                if (stderr)
                    console.error('[ClusterWorker] Stderr:', stderrExcerpt());
            }
            if (timedOut) {
                // The worker exited after our SIGTERM; report the timeout rather
                // than a misleading "no output" failure.
                reject(new ClusteringError(`Clustering worker timeout after ${WORKER_TIMEOUT_MS}ms`, 'WORKER_TIMEOUT', { stderr: stderrExcerpt() }));
                return;
            }
            const output = outputChunks.join('\n');
            if (!output.trim()) {
                const message = err
                    ? `Clustering worker failed: ${err.message}`
                    : 'Clustering worker produced no output';
                reject(new ClusteringError(message, 'WORKER_FAILED', {
                    stderr: stderrExcerpt(),
                }));
                return;
            }
            // Parse the last JSON line; earlier lines may be non-JSON noise.
            const lines = output.trim().split('\n');
            let parsed;
            for (let i = lines.length - 1; i >= 0; i--) {
                try {
                    parsed = JSON.parse(lines[i].trim());
                    break;
                }
                catch (error) {
                    // Not JSON: fall through and try the previous line.
                    console.error('[ClusteringService] JSON parse failed for output line, trying previous:', describe(error));
                }
            }
            const outputExcerpt = { output: output.substring(0, 1000) };
            if (parsed === undefined) {
                reject(new ClusteringError('Failed to parse clustering worker output as JSON', 'WORKER_PARSE_ERROR', outputExcerpt));
                return;
            }
            // Validate required fields exist on parsed worker result
            if (typeof parsed !== 'object' || parsed === null) {
                reject(new ClusteringError('Clustering worker returned non-object result', 'WORKER_PARSE_ERROR', outputExcerpt));
                return;
            }
            if (typeof parsed.success !== 'boolean') {
                reject(new ClusteringError('Clustering worker result missing required "success" field', 'WORKER_PARSE_ERROR', outputExcerpt));
                return;
            }
            if (parsed.success &&
                parsed.silhouette_score === undefined &&
                parsed.labels === undefined) {
                reject(new ClusteringError('Clustering worker success=true but missing "labels" and "silhouette_score" fields', 'WORKER_PARSE_ERROR', outputExcerpt));
                return;
            }
            resolve(parsed);
        };
        shell.send(input);
        shell.end(handleEnd);
    });
}
|
|
270
|
+
/**
 * Compute cosine similarity between a document embedding and a centroid.
 * Both vectors are assumed to be L2-normalized, so similarity = dot product.
 *
 * Iterates over the length of `a`; components missing from either vector are
 * treated as 0.
 *
 * @param a - First vector (array or typed array)
 * @param b - Second vector (array or typed array)
 * @returns Dot product clamped to [0, 1]; 0 when the product is NaN
 */
export function cosineSimilarity(a, b) {
    let dot = 0;
    for (let i = 0; i < a.length; i++) {
        dot += (a[i] ?? 0) * (b[i] ?? 0);
    }
    // Math.min/Math.max propagate NaN, so the clamp alone would let a NaN
    // component escape the documented [0, 1] range.
    if (Number.isNaN(dot)) {
        return 0;
    }
    // Clamp to [0, 1] to handle floating point drift
    return Math.max(0, Math.min(1, dot));
}
|
|
282
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
283
|
+
// MAIN CLUSTERING PIPELINE
|
|
284
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
285
|
+
/**
 * Run the full clustering pipeline:
 * 1. Compute document-level embeddings
 * 2. Validate minimum document count
 * 3. Call Python clustering worker and validate its result shape
 * 4. Create provenance records
 * 5. Store clusters + document_cluster assignments in one transaction
 *
 * @param db - DatabaseService instance
 * @param _vector - VectorService instance (not used by this function)
 * @param config - Clustering configuration
 * @param documentIds - Optional filter (empty = all documents with embeddings)
 * @returns ClusterRunResult with full results
 * @throws ClusteringError INSUFFICIENT_DOCUMENTS when fewer than 2 documents
 *   have embeddings, WORKER_FAILED when the worker reports failure, and
 *   WORKER_PARSE_ERROR when the worker result is structurally unusable
 */
export async function runClustering(db, _vector, config, documentIds) {
    const startTime = performance.now();
    const runId = uuidv4();
    const conn = db.getConnection();
    // Step 1: Compute document-level embeddings
    console.error(`[CLUSTER] Computing document embeddings...`);
    const docEmbeddings = computeDocumentEmbeddings(conn, documentIds);
    if (docEmbeddings.length < 2) {
        throw new ClusteringError(`At least 2 documents with embeddings required for clustering, got ${docEmbeddings.length}`, 'INSUFFICIENT_DOCUMENTS', { found: docEmbeddings.length, requested: documentIds?.length ?? 'all' });
    }
    console.error(`[CLUSTER] ${docEmbeddings.length} documents with embeddings`);
    // Step 2: Prepare data for Python worker
    const orderedDocIds = docEmbeddings.map((d) => d.document_id);
    const embeddingMatrix = docEmbeddings.map((d) => Array.from(d.embedding));
    // Step 3: Call Python clustering worker
    console.error(`[CLUSTER] Running ${config.algorithm} clustering...`);
    const workerResult = await runClusteringWorker(embeddingMatrix, orderedDocIds, config);
    if (!workerResult.success) {
        throw new ClusteringError(`Clustering worker failed: ${workerResult.error}`, 'WORKER_FAILED', {
            error_type: workerResult.error_type,
            error: workerResult.error,
        });
    }
    const labels = workerResult.labels;
    const probabilities = workerResult.probabilities;
    const centroids = workerResult.centroids;
    const coherenceScores = workerResult.coherence_scores;
    // Everything below indexes into these arrays, so reject a malformed result
    // up front instead of failing with a TypeError mid-transaction.
    const requiredArrays = {
        labels,
        probabilities,
        centroids,
        coherence_scores: coherenceScores,
    };
    for (const [field, value] of Object.entries(requiredArrays)) {
        if (!Array.isArray(value)) {
            throw new ClusteringError(`Clustering worker result missing required "${field}" array`, 'WORKER_PARSE_ERROR', { field });
        }
    }
    if (labels.length !== orderedDocIds.length) {
        throw new ClusteringError(`Clustering worker returned ${labels.length} labels for ${orderedDocIds.length} documents`, 'WORKER_PARSE_ERROR', { labels: labels.length, documents: orderedDocIds.length });
    }
    // Step 4: Store results in database
    console.error(`[CLUSTER] Found ${workerResult.n_clusters} clusters, storing results...`);
    const tracker = getProvenanceTracker(db);
    const now = new Date().toISOString();
    const processingDurationMs = Math.round(performance.now() - startTime);
    const algorithmParamsJson = JSON.stringify({
        algorithm: config.algorithm,
        n_clusters: config.n_clusters,
        min_cluster_size: config.min_cluster_size,
        distance_threshold: config.distance_threshold,
        linkage: config.linkage,
    });
    // Build cluster result items and store cluster records
    const clusterItems = [];
    // Group documents by cluster label
    const clusterGroups = new Map(); // label -> doc indices
    for (let i = 0; i < labels.length; i++) {
        const label = labels[i];
        if (label === -1)
            continue; // Skip noise
        const existing = clusterGroups.get(label);
        if (existing) {
            existing.push(i);
        }
        else {
            clusterGroups.set(label, [i]);
        }
    }
    // Sort cluster labels; a label's position in this order is its cluster_index
    // and is also used to index `centroids` and `coherence_scores`.
    const sortedLabels = Array.from(clusterGroups.keys()).sort((a, b) => a - b);
    if (centroids.length < sortedLabels.length) {
        throw new ClusteringError(`Clustering worker returned ${centroids.length} centroids for ${sortedLabels.length} clusters`, 'WORKER_PARSE_ERROR', { centroids: centroids.length, clusters: sortedLabels.length });
    }
    const labelToIndex = new Map(sortedLabels.map((label, index) => [label, index])); // label -> cluster index
    const clusterIdMap = new Map(); // label -> cluster UUID
    // Use a transaction to store everything atomically
    const storeTransaction = conn.transaction(() => {
        for (let ci = 0; ci < sortedLabels.length; ci++) {
            const label = sortedLabels[ci];
            const docIndices = clusterGroups.get(label);
            const centroid = centroids[ci];
            const coherence = coherenceScores[ci];
            const clusterId = uuidv4();
            clusterIdMap.set(label, clusterId);
            // Content hash from centroid + run_id
            const contentHash = computeHash(JSON.stringify(centroid) + ':' + runId);
            // Find any document's provenance_id as the source_id for the cluster provenance
            // Use the first document in this cluster
            const firstDocId = orderedDocIds[docIndices[0]];
            const firstDoc = db.getDocument(firstDocId);
            const sourceProvId = firstDoc?.provenance_id ?? null;
            // Create CLUSTERING provenance record
            const provId = tracker.createProvenance({
                type: ProvenanceType.CLUSTERING,
                source_type: 'CLUSTERING',
                source_id: sourceProvId,
                root_document_id: firstDoc?.provenance_id ?? runId,
                content_hash: contentHash,
                input_hash: computeHash(algorithmParamsJson + ':' + docIndices.length),
                processor: 'clustering-service',
                processor_version: '1.0.0',
                processing_params: {
                    algorithm: config.algorithm,
                    run_id: runId,
                    cluster_index: ci,
                    document_count: docIndices.length,
                },
                processing_duration_ms: processingDurationMs,
                processing_quality_score: coherence,
            });
            // Insert cluster record
            const cluster = {
                id: clusterId,
                run_id: runId,
                cluster_index: ci,
                label: null,
                description: null,
                classification_tag: null,
                document_count: docIndices.length,
                centroid_json: JSON.stringify(centroid),
                top_terms_json: null,
                coherence_score: coherence,
                algorithm: config.algorithm,
                algorithm_params_json: algorithmParamsJson,
                silhouette_score: workerResult.silhouette_score ?? null,
                content_hash: contentHash,
                provenance_id: provId,
                created_at: now,
                processing_duration_ms: processingDurationMs,
            };
            insertCluster(conn, cluster);
            // Build result item
            const itemDocIds = [];
            const itemSimilarities = [];
            const itemProbabilities = [];
            for (const idx of docIndices) {
                itemDocIds.push(orderedDocIds[idx]);
                itemSimilarities.push(cosineSimilarity(docEmbeddings[idx].embedding, centroid));
                itemProbabilities.push(probabilities[idx]);
            }
            clusterItems.push({
                cluster_index: ci,
                document_count: docIndices.length,
                coherence_score: coherence,
                centroid,
                document_ids: itemDocIds,
                similarities: itemSimilarities,
                probabilities: itemProbabilities,
            });
        }
        // Store document-cluster assignments (both clustered and noise documents)
        for (let i = 0; i < labels.length; i++) {
            const label = labels[i];
            const isNoise = label === -1;
            const clusterId = isNoise ? null : (clusterIdMap.get(label) ?? null);
            // Map lookup instead of indexOf keeps this loop linear in document count.
            const centroid = isNoise ? null : centroids[labelToIndex.get(label)];
            const similarity = centroid ? cosineSimilarity(docEmbeddings[i].embedding, centroid) : 0;
            const dc = {
                id: uuidv4(),
                document_id: orderedDocIds[i],
                cluster_id: clusterId,
                run_id: runId,
                // Stored rounded to 6 decimal places.
                similarity_to_centroid: Math.round(similarity * 1000000) / 1000000,
                membership_probability: probabilities[i],
                is_noise: isNoise,
                assigned_at: now,
            };
            insertDocumentCluster(conn, dc);
        }
    });
    storeTransaction();
    const noiseDocIds = orderedDocIds.filter((_, i) => labels[i] === -1);
    const totalDurationMs = Math.round(performance.now() - startTime);
    console.error(`[CLUSTER] Done: ${workerResult.n_clusters} clusters, ${noiseDocIds.length} noise docs, ${totalDurationMs}ms`);
    return {
        run_id: runId,
        algorithm: config.algorithm,
        n_clusters: workerResult.n_clusters,
        total_documents: docEmbeddings.length,
        noise_document_ids: noiseDocIds,
        silhouette_score: workerResult.silhouette_score ?? 0,
        clusters: clusterItems,
        processing_duration_ms: totalDurationMs,
    };
}
|
|
467
|
+
//# sourceMappingURL=clustering-service.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"clustering-service.js","sourceRoot":"","sources":["../../../src/services/clustering/clustering-service.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AACpC,OAAO,EAAE,WAAW,EAAiC,MAAM,cAAc,CAAC;AAC1E,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAIpC,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAS5D,OAAO,EAAE,aAAa,EAAE,qBAAqB,EAAE,MAAM,2CAA2C,CAAC;AACjG,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAElD,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAa/D,MAAM,eAAgB,SAAQ,KAAK;IAGf;IACA;IAHlB,YACE,OAAe,EACC,IAAyB,EACzB,OAAiC;QAEjD,KAAK,CAAC,OAAO,CAAC,CAAC;QAHC,SAAI,GAAJ,IAAI,CAAqB;QACzB,YAAO,GAAP,OAAO,CAA0B;QAGjD,IAAI,CAAC,IAAI,GAAG,iBAAiB,CAAC;IAChC,CAAC;CACF;AA6BD,kFAAkF;AAClF,UAAU;AACV,kFAAkF;AAElF,gCAAgC;AAChC,MAAM,iBAAiB,GAAG,OAAO,CAAC;AAElC,oCAAoC;AACpC,MAAM,iBAAiB,GAAG,MAAM,CAAC;AAEjC;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,yBAAyB,CACvC,IAAuB,EACvB,WAAsB;IAEtB,oFAAoF;IACpF,MAAM,SAAS,GAAG,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC;IACxD,MAAM,YAAY,GAAG,SAAS;QAC5B,CAAC,CAAC,0BAA0B,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;QACpE,CAAC,CAAC,EAAE,CAAC;IAEP,MAAM,IAAI,GAAG,IAAI;SACd,OAAO,CACN;;;;kCAI4B,YAAY;;GAE3C,CACE;SACA,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,CAGtC,CAAC;IAEH,uBAAuB;IACvB,MAAM,UAAU,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC/C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACjD,IAAI,QAAQ,EAAE,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED,2CAA2C;IAC3C,MAAM,OAAO,GAAwB,EAAE,CAAC;IACxC,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,UAAU,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;QACzC,OAAO,CAAC,IAAI,CAAC;YACX,WAAW,EAAE,KAAK;YAClB,SAAS,EAAE,QAAQ;YACnB,WAAW,EAAE,OAAO,CA
AC,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;;GAKG;AACH,SAAS,cAAc,CAAC,OAAiB;IACvC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;IAC/B,CAAC;IAED,MAAM,GAAG,GAAG,GAAG,CAAC;IAChB,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,yCAAyC;IAE5E,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;QAC9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7B,GAAG,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO;IACP,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IAED,eAAe;IACf,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IAChC,CAAC;IACD,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEvB,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7B,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;;;;GAWG;AACH,KAAK,UAAU,mBAAmB,CAChC,UAAsB,EACtB,WAAqB,EACrB,MAAwB,EACxB,cAA2B;IAE3B,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,sCAAsC,CAAC,CAAC;IAEnF,MAAM,WAAW,GAA4B;QAC3C,UAAU;QACV,YAAY,EAAE,WAAW;QACzB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,kBAAkB,EAAE,MAAM,CAAC,kBAAkB;QAC7C,OAAO,EAAE,MAAM,CAAC,OAAO;KACxB,CAAC;IAEF,IAAI,cAAc,EAAE,CAAC;QACnB,WAAW,CAAC,eAAe,GAAG,cAAc,CAAC;IAC/C,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAE1C,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,IAAI,OAAO,GAAG,KAAK,CAAC;QACpB,MAAM,OAAO,GAAuB;YAClC,IAAI,EAAE,MAAM;YACZ,aAAa,EAAE,CAAC,IAAI,CAAC;YACrB,IAAI,EAAE,EAAE;SACT,CAAC;QAEF,MAAM,KAAK,G
AAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QACnD,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,YAAY,GAAyC,IAAI,CAAC;QAE9D,MAAM,OAAO,GAAG,GAAG,EAAE;YACnB,IAAI,YAAY,EAAE,CAAC;gBACjB,YAAY,CAAC,YAAY,CAAC,CAAC;gBAC3B,YAAY,GAAG,IAAI,CAAC;YACtB,CAAC;QACH,CAAC,CAAC;QAEF,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;YAC5B,IAAI,OAAO;gBAAE,OAAO;YACpB,IAAI,CAAC;gBACH,KAAK,CAAC,IAAI,EAAE,CAAC;YACf,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,KAAK,CACX,sDAAsD,EACtD,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;gBACF,YAAY;YACd,CAAC;YACD,4DAA4D;YAC5D,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;gBAC7B,IAAI,CAAC,OAAO,EAAE,CAAC;oBACb,OAAO,CAAC,KAAK,CACX,iFAAiF,KAAK,CAAC,YAAY,EAAE,GAAG,GAAG,CAC5G,CAAC;oBACF,IAAI,CAAC;wBACH,KAAK,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;oBACtC,CAAC;oBAAC,OAAO,KAAK,EAAE,CAAC;wBACf,OAAO,CAAC,KAAK,CACX,sEAAsE,EACtE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;oBACJ,CAAC;gBACH,CAAC;gBACD,IAAI,CAAC,OAAO,EAAE,CAAC;oBACb,OAAO,GAAG,IAAI,CAAC;oBACf,MAAM,CACJ,IAAI,eAAe,CACjB,mCAAmC,iBAAiB,6BAA6B,EACjF,gBAAgB,EAChB,EAAE,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CACtC,CACF,CAAC;gBACJ,CAAC;YACH,CAAC,EAAE,IAAI,CAAC,CAAC;QACX,CAAC,EAAE,iBAAiB,CAAC,CAAC;QAEtB,MAAM,YAAY,GAAa,EAAE,CAAC;QAClC,KAAK,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,GAAW,EAAE,EAAE;YAClC,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,GAAW,EAAE,EAAE;YACjC,IAAI,MAAM,CAAC,MAAM,GAAG,iBAAiB,EAAE,CAAC;gBACtC,MAAM,IAAI,GAAG,GAAG,IAAI,CAAC;YACvB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,MAAM,SAAS,GAAG,CAAC,GAAW,EAAE,EAAE;YAChC,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,OAAO,EAAE,CAAC;YACV,IAAI,OAAO;gBAAE,OAAO;YACpB,OAAO,GAAG,IAAI,CAAC;YAEf,IAAI,GAAG,EAAE,CAAC;gBACR,OAAO,CAAC,KAAK,CAAC,wBAAwB,EAAE,GAAG,CAAC,OAAO,CAAC,CAAC;gBACrD,IAAI,MAAM;oBAAE,OAAO,CAAC,KAAK,CAAC,yBAAyB,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;YAClF,CAAC;YAED,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,EAA
E,CAAC;gBACnB,IAAI,GAAG,EAAE,CAAC;oBACR,MAAM,CACJ,IAAI,eAAe,CAAC,6BAA6B,GAAG,CAAC,OAAO,EAAE,EAAE,eAAe,EAAE;wBAC/E,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC;qBAClC,CAAC,CACH,CAAC;gBACJ,CAAC;qBAAM,CAAC;oBACN,MAAM,CACJ,IAAI,eAAe,CAAC,sCAAsC,EAAE,eAAe,EAAE;wBAC3E,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC;qBAClC,CAAC,CACH,CAAC;gBACJ,CAAC;gBACD,OAAO;YACT,CAAC;YAED,2BAA2B;YAC3B,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,MAAgC,CAAC;YACrC,KAAK,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC3C,IAAI,CAAC;oBACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAiB,CAAC;oBACrD,MAAM;gBACR,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,OAAO,CAAC,KAAK,CACX,yEAAyE,EACzE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;oBACF,iCAAiC;gBACnC,CAAC;YACH,CAAC;YAED,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;gBACzB,yDAAyD;gBACzD,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;oBAClD,MAAM,CACJ,IAAI,eAAe,CACjB,8CAA8C,EAC9C,oBAAoB,EACpB,EAAE,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CACtC,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBACD,IAAI,OAAO,MAAM,CAAC,OAAO,KAAK,SAAS,EAAE,CAAC;oBACxC,MAAM,CACJ,IAAI,eAAe,CACjB,2DAA2D,EAC3D,oBAAoB,EACpB,EAAE,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CACtC,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBACD,IACE,MAAM,CAAC,OAAO;oBACd,MAAM,CAAC,gBAAgB,KAAK,SAAS;oBACrC,MAAM,CAAC,MAAM,KAAK,SAAS,EAC3B,CAAC;oBACD,MAAM,CACJ,IAAI,eAAe,CACjB,mFAAmF,EACnF,oBAAoB,EACpB,EAAE,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CACtC,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBACD,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,CACJ,IAAI,eAAe,CACjB,kDAAkD,EAClD,oBAAoB,EACpB,EAAE,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CACtC,CACF,CAAC;YACJ,CAAC;QACH,CAAC,CAAC;QAEF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClB,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,CAA0B,EAAE,CAAW;I
ACtE,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IACnC,CAAC;IACD,iDAAiD;IACjD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,kFAAkF;AAClF,2BAA2B;AAC3B,kFAAkF;AAElF;;;;;;;;;;;;;GAaG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,EAAmB,EACnB,OAAsB,EACtB,MAAwB,EACxB,WAAsB;IAEtB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IACpC,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC;IACvB,MAAM,IAAI,GAAG,EAAE,CAAC,aAAa,EAAE,CAAC;IAEhC,4CAA4C;IAC5C,OAAO,CAAC,KAAK,CAAC,4CAA4C,CAAC,CAAC;IAC5D,MAAM,aAAa,GAAG,yBAAyB,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAEnE,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,eAAe,CACvB,qEAAqE,aAAa,CAAC,MAAM,EAAE,EAC3F,wBAAwB,EACxB,EAAE,KAAK,EAAE,aAAa,CAAC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,IAAI,KAAK,EAAE,CACzE,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,KAAK,CAAC,aAAa,aAAa,CAAC,MAAM,4BAA4B,CAAC,CAAC;IAE7E,yCAAyC;IACzC,MAAM,aAAa,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAC9D,MAAM,eAAe,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;IAE1E,wCAAwC;IACxC,OAAO,CAAC,KAAK,CAAC,qBAAqB,MAAM,CAAC,SAAS,gBAAgB,CAAC,CAAC;IACrE,MAAM,YAAY,GAAG,MAAM,mBAAmB,CAC5C,eAAe,EACf,aAAa,EACb,MAAM,CACP,CAAC;IAEF,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC;QAC1B,MAAM,IAAI,eAAe,CAAC,6BAA6B,YAAY,CAAC,KAAK,EAAE,EAAE,eAAe,EAAE;YAC5F,UAAU,EAAE,YAAY,CAAC,UAAU;YACnC,KAAK,EAAE,YAAY,CAAC,KAAK;SAC1B,CAAC,CAAC;IACL,CAAC;IAED,oCAAoC;IACpC,OAAO,CAAC,KAAK,CAAC,mBAAmB,YAAY,CAAC,UAAU,+BAA+B,CAAC,CAAC;IACzF,MAAM,OAAO,GAAG,oBAAoB,CAAC,EAAE,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACrC,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,CAAC;IACvE,MAAM,mBAAmB,GAAG,IAAI,CAAC,SAAS,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,kBAAkB,EAAE,MAAM,C
AAC,kBAAkB;QAC7C,OAAO,EAAE,MAAM,CAAC,OAAO;KACxB,CAAC,CAAC;IAEH,uDAAuD;IACvD,MAAM,YAAY,GAAwB,EAAE,CAAC;IAC7C,MAAM,MAAM,GAAG,YAAY,CAAC,MAAO,CAAC;IACpC,MAAM,aAAa,GAAG,YAAY,CAAC,aAAc,CAAC;IAClD,MAAM,SAAS,GAAG,YAAY,CAAC,SAAU,CAAC;IAC1C,MAAM,eAAe,GAAG,YAAY,CAAC,gBAAiB,CAAC;IAEvD,mCAAmC;IACnC,MAAM,aAAa,GAAG,IAAI,GAAG,EAAoB,CAAC,CAAC,uBAAuB;IAC1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACxB,IAAI,KAAK,KAAK,CAAC,CAAC;YAAE,SAAS,CAAC,aAAa;QACzC,MAAM,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC1C,IAAI,QAAQ,EAAE,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACnB,CAAC;aAAM,CAAC;YACN,aAAa,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,wBAAwB;IAExE,mDAAmD;IACnD,MAAM,gBAAgB,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE;QAC7C,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,YAAY,CAAC,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,YAAY,CAAC,EAAE,CAAC,CAAC;YAC/B,MAAM,UAAU,GAAG,aAAa,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC;YAC7C,MAAM,QAAQ,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;YAC/B,MAAM,SAAS,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;YAEtC,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC;YAC3B,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;YAEnC,sCAAsC;YACtC,MAAM,WAAW,GAAG,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,GAAG,GAAG,KAAK,CAAC,CAAC;YAExE,gFAAgF;YAChF,yCAAyC;YACzC,MAAM,UAAU,GAAG,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAChD,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;YAC5C,MAAM,YAAY,GAAG,QAAQ,EAAE,aAAa,IAAI,IAAI,CAAC;YAErD,sCAAsC;YACtC,MAAM,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;gBACtC,IAAI,EAAE,cAAc,CAAC,UAAU;gBAC/B,WAAW,EAAE,YAA0B;gBACvC,SAAS,EAAE,YAAY;gBACvB,gBAAgB,EAAE,QAAQ,EAAE,aAAa,IAAI,KAAK;gBAClD,YAAY,EAAE,WAAW;gBACzB,UAAU,EAAE,WAAW,CAAC,mBAAmB,GAAG,GAAG,GAAG,UAAU,CAAC,MAAM,CAAC;gBACtE,SAAS,EAAE,oBAAoB;gBAC/B,iBAAiB,EAAE,OAAO;gBAC
1B,iBAAiB,EAAE;oBACjB,SAAS,EAAE,MAAM,CAAC,SAAS;oBAC3B,MAAM,EAAE,KAAK;oBACb,aAAa,EAAE,EAAE;oBACjB,cAAc,EAAE,UAAU,CAAC,MAAM;iBAClC;gBACD,sBAAsB,EAAE,oBAAoB;gBAC5C,wBAAwB,EAAE,SAAS;aACpC,CAAC,CAAC;YAEH,wBAAwB;YACxB,MAAM,OAAO,GAAY;gBACvB,EAAE,EAAE,SAAS;gBACb,MAAM,EAAE,KAAK;gBACb,aAAa,EAAE,EAAE;gBACjB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,IAAI;gBACjB,kBAAkB,EAAE,IAAI;gBACxB,cAAc,EAAE,UAAU,CAAC,MAAM;gBACjC,aAAa,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC;gBACvC,cAAc,EAAE,IAAI;gBACpB,eAAe,EAAE,SAAS;gBAC1B,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,qBAAqB,EAAE,mBAAmB;gBAC1C,gBAAgB,EAAE,YAAY,CAAC,gBAAgB,IAAI,IAAI;gBACvD,YAAY,EAAE,WAAW;gBACzB,aAAa,EAAE,MAAM;gBACrB,UAAU,EAAE,GAAG;gBACf,sBAAsB,EAAE,oBAAoB;aAC7C,CAAC;YAEF,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;YAE7B,oBAAoB;YACpB,MAAM,UAAU,GAAa,EAAE,CAAC;YAChC,MAAM,gBAAgB,GAAa,EAAE,CAAC;YACtC,MAAM,iBAAiB,GAAa,EAAE,CAAC;YAEvC,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;gBAC7B,UAAU,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;gBACpC,gBAAgB,CAAC,IAAI,CAAC,gBAAgB,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC,CAAC;gBAChF,iBAAiB,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;YAC7C,CAAC;YAED,YAAY,CAAC,IAAI,CAAC;gBAChB,aAAa,EAAE,EAAE;gBACjB,cAAc,EAAE,UAAU,CAAC,MAAM;gBACjC,eAAe,EAAE,SAAS;gBAC1B,QAAQ;gBACR,YAAY,EAAE,UAAU;gBACxB,YAAY,EAAE,gBAAgB;gBAC9B,aAAa,EAAE,iBAAiB;aACjC,CAAC,CAAC;QACL,CAAC;QAED,0EAA0E;QAC1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC;YAC7B,MAAM,SAAS,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,CAAC;YACrE,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;YACzE,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAEzF,MAAM,EAAE,GAAoB;gBAC1B,EAAE,EAAE,MAAM,EAAE;gBACZ,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC;gBAC7B,UAAU,EAAE,SAAS;gBACrB,MAAM,EAAE,KAAK;gBACb,sBAAsB,EAAE,IAAI
,CAAC,KAAK,CAAC,UAAU,GAAG,OAAO,CAAC,GAAG,OAAO;gBAClE,sBAAsB,EAAE,aAAa,CAAC,CAAC,CAAC;gBACxC,QAAQ,EAAE,OAAO;gBACjB,WAAW,EAAE,GAAG;aACjB,CAAC;YAEF,qBAAqB,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAClC,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,gBAAgB,EAAE,CAAC;IAEnB,MAAM,WAAW,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAErE,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,CAAC;IAClE,OAAO,CAAC,KAAK,CACX,mBAAmB,YAAY,CAAC,UAAU,cAAc,WAAW,CAAC,MAAM,gBAAgB,eAAe,IAAI,CAC9G,CAAC;IAEF,OAAO;QACL,MAAM,EAAE,KAAK;QACb,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,YAAY,CAAC,UAAW;QACpC,eAAe,EAAE,aAAa,CAAC,MAAM;QACrC,kBAAkB,EAAE,WAAW;QAC/B,gBAAgB,EAAE,YAAY,CAAC,gBAAgB,IAAI,CAAC;QACpD,QAAQ,EAAE,YAAY;QACtB,sBAAsB,EAAE,eAAe;KACxC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Comparison Diff Service
|
|
3
|
+
*
|
|
4
|
+
* Computes text and structural diffs between two OCR-processed documents.
|
|
5
|
+
* Uses the `diff` npm package (jsdiff) for text comparison.
|
|
6
|
+
*
|
|
7
|
+
* CRITICAL: NEVER use console.log() - stdout is JSON-RPC protocol.
|
|
8
|
+
*/
|
|
9
|
+
import type { TextDiffResult, StructuralDiff } from '../../models/comparison.js';
|
|
10
|
+
/**
|
|
11
|
+
* Input shape for a document's structural metadata used in compareStructure()
|
|
12
|
+
*/
|
|
13
|
+
export interface StructuralDocInput {
|
|
14
|
+
/** Page count; nullable. NOTE(review): null presumably means "unknown" - confirm with the caller. */
page_count: number | null;
|
|
15
|
+
/** Text length. NOTE(review): presumably the character length of the OCR text - confirm. */
text_length: number;
|
|
16
|
+
/** Quality score; nullable. NOTE(review): scale and meaning of null are not visible here - confirm. */
quality_score: number | null;
|
|
17
|
+
/** OCR mode identifier, passed through as an opaque string by compareStructure(). */
ocr_mode: string;
|
|
18
|
+
/** Chunk count. NOTE(review): presumably the number of chunks produced for the document - confirm. */
chunk_count: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Compare structural metadata between two documents
|
|
22
|
+
*
* The implementation copies each input field verbatim into a `doc1_*` /
* `doc2_*` pair (page_count, chunk_count, text_length, quality_score,
* ocr_mode); it does not compute deltas or normalize values.
*
|
|
23
|
+
* @param doc1 - First document structural metadata
|
|
24
|
+
* @param doc2 - Second document structural metadata
|
|
25
|
+
* @returns StructuralDiff with side-by-side metadata
|
|
26
|
+
*/
|
|
27
|
+
export declare function compareStructure(doc1: StructuralDocInput, doc2: StructuralDocInput): StructuralDiff;
|
|
28
|
+
/**
|
|
29
|
+
* Compare two texts using line-level diff
|
|
30
|
+
*
* Only the returned `operations` list is capped by `maxOperations`;
* `total_operations` and the insertion / deletion / unchanged counts are
* computed over the full diff. The similarity ratio is
* 2 * (unchanged characters) / (text1.length + text2.length), rounded to four
* decimal places, and is 1 when both texts are empty.
*
|
|
31
|
+
* @param text1 - First document text
|
|
32
|
+
* @param text2 - Second document text
|
|
33
|
+
* @param maxOperations - Maximum operations to return (default 1000)
|
|
34
|
+
* @returns TextDiffResult with operations, counts, and similarity ratio
|
|
35
|
+
*/
|
|
36
|
+
export declare function compareText(text1: string, text2: string, maxOperations?: number): TextDiffResult;
|
|
37
|
+
/**
|
|
38
|
+
* Generate a human-readable summary of the comparison
*
* The summary is a sequence of sentences joined by single spaces. It always
* opens with the two document names; text-similarity sentences are included
* only when `textDiff` is non-null, and a page-count sentence is appended only
* when the implementation finds a non-zero page-count difference.
*
* @param textDiff - Text diff result, or null to omit the text-similarity sentences
* @param structuralDiff - Structural diff; only its page counts are read
* @param doc1Name - Display name of the first document (quoted in the opening sentence)
* @param doc2Name - Display name of the second document (quoted in the opening sentence)
* @returns The summary as a single string
|
|
39
|
+
*/
|
|
40
|
+
export declare function generateSummary(textDiff: TextDiffResult | null, structuralDiff: StructuralDiff, doc1Name: string, doc2Name: string): string;
|
|
41
|
+
//# sourceMappingURL=diff-service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diff-service.d.ts","sourceRoot":"","sources":["../../../src/services/comparison/diff-service.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,KAAK,EAEV,cAAc,EACd,cAAc,EACf,MAAM,4BAA4B,CAAC;AAEpC;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;GAMG;AACH,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,kBAAkB,EACxB,IAAI,EAAE,kBAAkB,GACvB,cAAc,CAahB;AAED;;;;;;;GAOG;AACH,wBAAgB,WAAW,CACzB,KAAK,EAAE,MAAM,EACb,KAAK,EAAE,MAAM,EACb,aAAa,GAAE,MAAa,GAC3B,cAAc,CA+DhB;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,cAAc,GAAG,IAAI,EAC/B,cAAc,EAAE,cAAc,EAC9B,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,MAAM,GACf,MAAM,CAwBR"}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Comparison Diff Service
|
|
3
|
+
*
|
|
4
|
+
* Computes text and structural diffs between two OCR-processed documents.
|
|
5
|
+
* Uses the `diff` npm package (jsdiff) for text comparison.
|
|
6
|
+
*
|
|
7
|
+
* CRITICAL: NEVER use console.log() - stdout is JSON-RPC protocol.
|
|
8
|
+
*/
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
|
+
/**
 * Lay two documents' structural metadata side by side.
 *
 * Every field is copied through unchanged into a `doc1_*` / `doc2_*` pair;
 * nothing is computed, compared, or normalized here.
 *
 * @param doc1 - First document structural metadata
 * @param doc2 - Second document structural metadata
 * @returns StructuralDiff with side-by-side metadata
 */
export function compareStructure(doc1, doc2) {
    const {
        page_count: firstPages,
        chunk_count: firstChunks,
        text_length: firstLength,
        quality_score: firstQuality,
        ocr_mode: firstMode,
    } = doc1;
    const {
        page_count: secondPages,
        chunk_count: secondChunks,
        text_length: secondLength,
        quality_score: secondQuality,
        ocr_mode: secondMode,
    } = doc2;

    // Keys are interleaved (doc1, doc2) per field so the serialized order stays stable.
    return {
        doc1_page_count: firstPages,
        doc2_page_count: secondPages,
        doc1_chunk_count: firstChunks,
        doc2_chunk_count: secondChunks,
        doc1_text_length: firstLength,
        doc2_text_length: secondLength,
        doc1_quality_score: firstQuality,
        doc2_quality_score: secondQuality,
        doc1_ocr_mode: firstMode,
        doc2_ocr_mode: secondMode,
    };
}
|
|
31
|
+
/**
 * Compare two texts using line-level diff
 *
 * Each change reported by `diffLines` becomes one operation carrying the
 * character offsets (into text1 and text2) at which that change starts.
 * Counts and the similarity ratio are always computed over the full diff;
 * only the returned `operations` list is capped.
 *
 * @param text1 - First document text
 * @param text2 - Second document text
 * @param maxOperations - Maximum operations to return (default 1000).
 *   Negative values are treated as 0.
 * @returns TextDiffResult with operations, counts, and similarity ratio
 */
export function compareText(text1, text2, maxOperations = 1000) {
    const changes = diffLines(text1, text2);
    // A negative limit would make slice() count from the END of the array and
    // return "all but the last N" operations; clamp so the cap is always "at most N".
    const limit = Math.max(0, maxOperations);
    let doc1Offset = 0;
    let doc2Offset = 0;
    let insertions = 0;
    let deletions = 0;
    let unchanged = 0;
    let unchangedChars = 0;
    const operations = [];
    for (const change of changes) {
        let type = 'equal';
        if (change.added) {
            type = 'insert';
        }
        else if (change.removed) {
            type = 'delete';
        }
        // Offsets are recorded BEFORE advancing, so they mark where the change starts.
        operations.push({
            type,
            text: change.value,
            doc1_offset: doc1Offset,
            doc2_offset: doc2Offset,
            line_count: change.count ?? 0,
        });
        const length = change.value.length;
        if (type === 'insert') {
            // Inserted text exists only in text2.
            insertions++;
            doc2Offset += length;
        }
        else if (type === 'delete') {
            // Deleted text exists only in text1.
            deletions++;
            doc1Offset += length;
        }
        else {
            unchanged++;
            unchangedChars += length;
            doc1Offset += length;
            doc2Offset += length;
        }
    }
    const totalOps = operations.length;
    const truncated = totalOps > limit;
    const finalOps = truncated ? operations.slice(0, limit) : operations;
    // Similarity = unchanged chars / total chars (unchanged text counts once per document).
    const totalChars = text1.length + text2.length;
    const similarityRatio = totalChars === 0 ? 1.0 : (2 * unchangedChars) / totalChars;
    return {
        operations: finalOps,
        total_operations: totalOps,
        truncated,
        insertions,
        deletions,
        unchanged,
        // Rounded to four decimal places.
        similarity_ratio: Math.round(similarityRatio * 10000) / 10000,
        doc1_length: text1.length,
        doc2_length: text2.length,
    };
}
|
|
100
|
+
/**
 * Generate a human-readable summary of the comparison
 *
 * The summary is built from sentences joined by single spaces:
 *   1. an opening sentence naming both documents (always present);
 *   2. text similarity and insertion/deletion/unchanged counts, plus a
 *      truncation note, when `textDiff` is provided;
 *   3. the absolute page-count difference, when both page counts are known
 *      and differ.
 *
 * @param textDiff - Text diff result, or null to omit the text sentences
 * @param structuralDiff - Structural diff; only the page counts are read
 * @param doc1Name - Display name of the first document
 * @param doc2Name - Display name of the second document
 * @returns The summary as a single string
 */
export function generateSummary(textDiff, structuralDiff, doc1Name, doc2Name) {
    const parts = [`Comparison of "${doc1Name}" vs "${doc2Name}".`];
    if (textDiff) {
        const pct = Math.round(textDiff.similarity_ratio * 100);
        parts.push(`Text similarity: ${pct}%.`);
        parts.push(`${textDiff.insertions} insertions, ${textDiff.deletions} deletions, ${textDiff.unchanged} unchanged sections.`);
        if (textDiff.truncated) {
            parts.push(`(Diff truncated: showing ${textDiff.operations.length} of ${textDiff.total_operations} operations.)`);
        }
    }
    const { doc1_page_count: pages1, doc2_page_count: pages2 } = structuralDiff;
    // A null/undefined page count means "unknown", not zero. Coercing it to 0
    // would report a bogus difference equal to the other document's page count,
    // so the sentence is only emitted when both counts are known.
    if (pages1 != null && pages2 != null) {
        const pageDiff = pages1 - pages2;
        if (pageDiff !== 0) {
            parts.push(`Page count difference: ${Math.abs(pageDiff)} pages.`);
        }
    }
    return parts.join(' ');
}
|
|
120
|
+
//# sourceMappingURL=diff-service.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diff-service.js","sourceRoot":"","sources":["../../../src/services/comparison/diff-service.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,MAAM,CAAC;AAkBjC;;;;;;GAMG;AACH,MAAM,UAAU,gBAAgB,CAC9B,IAAwB,EACxB,IAAwB;IAExB,OAAO;QACL,eAAe,EAAE,IAAI,CAAC,UAAU;QAChC,eAAe,EAAE,IAAI,CAAC,UAAU;QAChC,gBAAgB,EAAE,IAAI,CAAC,WAAW;QAClC,gBAAgB,EAAE,IAAI,CAAC,WAAW;QAClC,gBAAgB,EAAE,IAAI,CAAC,WAAW;QAClC,gBAAgB,EAAE,IAAI,CAAC,WAAW;QAClC,kBAAkB,EAAE,IAAI,CAAC,aAAa;QACtC,kBAAkB,EAAE,IAAI,CAAC,aAAa;QACtC,aAAa,EAAE,IAAI,CAAC,QAAQ;QAC5B,aAAa,EAAE,IAAI,CAAC,QAAQ;KAC7B,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,WAAW,CACzB,KAAa,EACb,KAAa,EACb,gBAAwB,IAAI;IAE5B,MAAM,OAAO,GAAG,SAAS,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAExC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,MAAM,UAAU,GAAwB,EAAE,CAAC;IAE3C,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,IAA+B,CAAC;QACpC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YACjB,IAAI,GAAG,QAAQ,CAAC;QAClB,CAAC;aAAM,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC1B,IAAI,GAAG,QAAQ,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,IAAI,GAAG,OAAO,CAAC;QACjB,CAAC;QAED,UAAU,CAAC,IAAI,CAAC;YACd,IAAI;YACJ,IAAI,EAAE,MAAM,CAAC,KAAK;YAClB,WAAW,EAAE,UAAU;YACvB,WAAW,EAAE,UAAU;YACvB,UAAU,EAAE,MAAM,CAAC,KAAK,IAAI,CAAC;SAC9B,CAAC,CAAC;QAEH,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YACjB,UAAU,EAAE,CAAC;YACb,UAAU,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QACpC,CAAC;aAAM,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC1B,SAAS,EAAE,CAAC;YACZ,UAAU,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,SAAS,EAAE,CAAC;YACZ,UAAU,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;YAClC,UAAU,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC;QACpC,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,UAAU,CAAC,MAAM,CAAC;IACnC,MAAM,SAAS,GAAG,QAAQ,GAAG,aAAa,CAAC;IAC3C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;IAE7E,6CAA6C;IAC7C,MAAM,cAAc,GAAG,UAAU;SAC9B,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,
OAAO,CAAC;SACjC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IAC/C,MAAM,eAAe,GAAG,UAAU,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,cAAc,CAAC,GAAG,UAAU,CAAC;IAEnF,OAAO;QACL,UAAU,EAAE,QAAQ;QACpB,gBAAgB,EAAE,QAAQ;QAC1B,SAAS;QACT,UAAU;QACV,SAAS;QACT,SAAS;QACT,gBAAgB,EAAE,IAAI,CAAC,KAAK,CAAC,eAAe,GAAG,KAAK,CAAC,GAAG,KAAK;QAC7D,WAAW,EAAE,KAAK,CAAC,MAAM;QACzB,WAAW,EAAE,KAAK,CAAC,MAAM;KAC1B,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,QAA+B,EAC/B,cAA8B,EAC9B,QAAgB,EAChB,QAAgB;IAEhB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,kBAAkB,QAAQ,SAAS,QAAQ,IAAI,CAAC,CAAC;IAE5D,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,gBAAgB,GAAG,GAAG,CAAC,CAAC;QACxD,KAAK,CAAC,IAAI,CAAC,oBAAoB,GAAG,IAAI,CAAC,CAAC;QACxC,KAAK,CAAC,IAAI,CACR,GAAG,QAAQ,CAAC,UAAU,gBAAgB,QAAQ,CAAC,SAAS,eAAe,QAAQ,CAAC,SAAS,sBAAsB,CAChH,CAAC;QACF,IAAI,QAAQ,CAAC,SAAS,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CACR,4BAA4B,QAAQ,CAAC,UAAU,CAAC,MAAM,OAAO,QAAQ,CAAC,gBAAgB,eAAe,CACtG,CAAC;QACJ,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,CAAC,cAAc,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC;IAC/F,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;QACnB,KAAK,CAAC,IAAI,CAAC,0BAA0B,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;IACpE,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC"}
|