ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OCR Provenance MCP System - Python Workers
|
|
3
|
+
|
|
4
|
+
This package provides:
|
|
5
|
+
- GPU utilities for device detection and VRAM monitoring
|
|
6
|
+
- Datalab OCR worker for document processing
|
|
7
|
+
- Embedding worker for local inference with nomic-embed-text-v1.5
|
|
8
|
+
|
|
9
|
+
CRITICAL DESIGN PRINCIPLES:
|
|
10
|
+
- CP-004: Local Inference - Embedding generation MUST run locally
|
|
11
|
+
- No data leaves the local machine for embedding generation
|
|
12
|
+
- Auto-detects best device: CUDA > MPS (Apple Silicon) > CPU
|
|
13
|
+
|
|
14
|
+
Supported Platforms:
|
|
15
|
+
- Linux/Windows with NVIDIA GPU (CUDA)
|
|
16
|
+
- macOS with Apple Silicon (MPS)
|
|
17
|
+
- Any platform without GPU (CPU fallback)
|
|
18
|
+
|
|
19
|
+
Module Structure:
|
|
20
|
+
- gpu_utils: GPU verification, VRAM monitoring, device detection
|
|
21
|
+
- ocr_worker: Datalab OCR API integration
|
|
22
|
+
- embedding_worker: nomic-embed-text-v1.5 inference
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
__version__ = "1.0.0"
|
|
26
|
+
__author__ = "OCR Provenance MCP System"
|
|
27
|
+
|
|
28
|
+
from .embedding_worker import (
|
|
29
|
+
DEFAULT_BATCH_SIZE,
|
|
30
|
+
DEFAULT_DEVICE,
|
|
31
|
+
EMBEDDING_DIM,
|
|
32
|
+
MODEL_NAME,
|
|
33
|
+
# Constants
|
|
34
|
+
MODEL_PATH,
|
|
35
|
+
MODEL_VERSION,
|
|
36
|
+
PREFIX_DOCUMENT,
|
|
37
|
+
PREFIX_QUERY,
|
|
38
|
+
# Data classes
|
|
39
|
+
EmbeddingResult,
|
|
40
|
+
QueryEmbeddingResult,
|
|
41
|
+
embed_chunks,
|
|
42
|
+
embed_query,
|
|
43
|
+
embed_with_oom_recovery,
|
|
44
|
+
generate_embeddings,
|
|
45
|
+
generate_query_embedding,
|
|
46
|
+
# Core functions
|
|
47
|
+
load_model,
|
|
48
|
+
)
|
|
49
|
+
from .gpu_utils import (
|
|
50
|
+
EmbeddingModelError,
|
|
51
|
+
# Error classes
|
|
52
|
+
GPUError,
|
|
53
|
+
# Type definitions
|
|
54
|
+
GPUInfo,
|
|
55
|
+
GPUNotAvailableError,
|
|
56
|
+
GPUOutOfMemoryError,
|
|
57
|
+
ModelInfo,
|
|
58
|
+
VRAMUsage,
|
|
59
|
+
clear_gpu_memory,
|
|
60
|
+
get_vram_usage,
|
|
61
|
+
test_embedding_generation,
|
|
62
|
+
# Core functions
|
|
63
|
+
verify_gpu,
|
|
64
|
+
verify_model_loading,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Public API of the package. Entries are ordered as UPPER_CASE constants,
# then CamelCase classes, then lowercase names, so symbols from the two
# source modules are interleaved; each entry is tagged with its origin.
__all__ = [
    "DEFAULT_BATCH_SIZE",  # embedding_worker
    "DEFAULT_DEVICE",  # embedding_worker
    "EMBEDDING_DIM",  # embedding_worker
    "MODEL_NAME",  # embedding_worker
    "MODEL_PATH",  # embedding_worker
    "MODEL_VERSION",  # embedding_worker
    "PREFIX_DOCUMENT",  # embedding_worker
    "PREFIX_QUERY",  # embedding_worker
    "EmbeddingModelError",  # gpu_utils
    "EmbeddingResult",  # embedding_worker
    "GPUError",  # gpu_utils
    "GPUInfo",  # gpu_utils
    "GPUNotAvailableError",  # gpu_utils
    "GPUOutOfMemoryError",  # gpu_utils
    "ModelInfo",  # gpu_utils
    "QueryEmbeddingResult",  # embedding_worker
    "VRAMUsage",  # gpu_utils
    "__version__",  # defined in this module
    "clear_gpu_memory",  # gpu_utils
    "embed_chunks",  # embedding_worker
    "embed_query",  # embedding_worker
    "embed_with_oom_recovery",  # embedding_worker
    "generate_embeddings",  # embedding_worker
    "generate_query_embedding",  # embedding_worker
    "get_vram_usage",  # gpu_utils
    "load_model",  # embedding_worker
    "test_embedding_generation",  # gpu_utils
    "verify_gpu",  # gpu_utils
    "verify_model_loading",  # gpu_utils
]
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Document Clustering Worker for OCR Provenance MCP System
|
|
4
|
+
|
|
5
|
+
Clusters documents by their embedding vectors using HDBSCAN, Agglomerative,
|
|
6
|
+
or K-Means algorithms. Reads JSON from stdin, writes JSON to stdout.
|
|
7
|
+
|
|
8
|
+
CRITICAL CONSTRAINTS:
|
|
9
|
+
- NEVER use print() except for the final JSON output to stdout
|
|
10
|
+
- Use sys.stderr.write() for any debug logging
|
|
11
|
+
- All numpy types MUST be converted to Python types before JSON output
|
|
12
|
+
|
|
13
|
+
Dependencies: scikit-learn >= 1.3 (includes HDBSCAN), numpy
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
echo '{"embeddings": [...], "document_ids": [...], "algorithm": "hdbscan"}' | python clustering_worker.py
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
import time
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def validate_inputs(data: dict) -> tuple[np.ndarray, list[str], str, dict, np.ndarray | None]:
    """
    Validate and extract inputs from the parsed JSON data.

    Args:
        data: Parsed request. Requires 'embeddings'; optionally carries
            'document_ids', 'algorithm', 'n_clusters', 'min_cluster_size',
            'distance_threshold', 'linkage' and 'distance_matrix'.

    Returns:
        Tuple of (embeddings, document_ids, algorithm, params, distance_matrix)
        distance_matrix is None when not provided (use cosine on embeddings).
        document_ids is an empty list when absent or null.

    Raises:
        ValueError: On invalid inputs
    """
    # Validate embeddings
    if "embeddings" not in data:
        raise ValueError("Missing required field: 'embeddings'")

    try:
        embeddings = np.array(data["embeddings"], dtype=np.float32)
    except (TypeError, ValueError) as exc:
        # Ragged rows or non-numeric entries; report as a validation error.
        raise ValueError(f"Embeddings must be a rectangular array of numbers: {exc}") from exc

    if embeddings.ndim != 2:
        raise ValueError(f"Embeddings must be 2-dimensional (N, D), got shape {embeddings.shape}")

    n_docs = embeddings.shape[0]
    if n_docs < 2:
        raise ValueError(f"At least 2 documents required for clustering, got {n_docs}")

    if embeddings.shape[1] == 0:
        raise ValueError(f"Embeddings must have at least 1 dimension, got shape {embeddings.shape}")

    # numpy converts JSON null (None) to NaN for float dtypes, so missing
    # values would otherwise slip through and surface as NaN centroids or
    # obscure downstream failures.
    if not np.isfinite(embeddings).all():
        raise ValueError("Embeddings contain NaN, null, or infinite values")

    # Validate document_ids (absent and null are both treated as "not provided")
    document_ids = data.get("document_ids")
    if document_ids is None:
        document_ids = []
    elif not isinstance(document_ids, list):
        raise ValueError(f"document_ids must be a list, got {type(document_ids).__name__}")
    if document_ids and len(document_ids) != n_docs:
        raise ValueError(
            f"document_ids length ({len(document_ids)}) does not match embeddings count ({n_docs})"
        )

    # Validate algorithm
    algorithm = data.get("algorithm", "hdbscan")
    valid_algorithms = ("hdbscan", "agglomerative", "kmeans")
    if algorithm not in valid_algorithms:
        raise ValueError(f"Unknown algorithm '{algorithm}'. Must be one of: {valid_algorithms}")

    # Extract algorithm parameters
    params = {
        "n_clusters": data.get("n_clusters"),
        "min_cluster_size": data.get("min_cluster_size", 3),
        "distance_threshold": data.get("distance_threshold", 1.0),
        "linkage": data.get("linkage", "average"),
    }

    # Validate optional precomputed distance matrix (null means "not provided")
    distance_matrix: np.ndarray | None = None
    if data.get("distance_matrix") is not None:
        try:
            distance_matrix = np.array(data["distance_matrix"], dtype=np.float64)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"distance_matrix must be a rectangular array of numbers: {exc}"
            ) from exc
        if distance_matrix.shape != (n_docs, n_docs):
            raise ValueError(
                f"distance_matrix shape {distance_matrix.shape} does not match "
                f"document count ({n_docs}, {n_docs})"
            )

    return embeddings, document_ids, algorithm, params, distance_matrix
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def cluster_hdbscan(
    embeddings: np.ndarray,
    min_cluster_size: int,
    distance_matrix: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Run HDBSCAN on a pairwise distance matrix.

    Args:
        embeddings: (N, D) array; only used to derive cosine distances when
            no distance_matrix is supplied
        min_cluster_size: Minimum points to form a cluster
        distance_matrix: Optional precomputed distance matrix (N, N)

    Returns:
        Tuple of (labels, probabilities) as reported by the fitted estimator
    """
    from sklearn.cluster import HDBSCAN
    from sklearn.metrics.pairwise import cosine_distances

    if distance_matrix is None:
        pairwise = cosine_distances(embeddings)
    else:
        pairwise = distance_matrix

    hdbscan_options = {
        "min_cluster_size": min_cluster_size,
        "metric": "precomputed",
        "cluster_selection_method": "eom",
        "allow_single_cluster": True,
    }
    model = HDBSCAN(**hdbscan_options)

    # Fit on a copy: the estimator may modify the matrix it is given, and the
    # caller's distance_matrix must stay intact.
    assignments = model.fit_predict(pairwise.copy())

    return assignments, model.probabilities_
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def cluster_agglomerative(
|
|
122
|
+
embeddings: np.ndarray,
|
|
123
|
+
n_clusters: int | None,
|
|
124
|
+
distance_threshold: float,
|
|
125
|
+
linkage: str,
|
|
126
|
+
distance_matrix: np.ndarray | None = None,
|
|
127
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
128
|
+
"""
|
|
129
|
+
Cluster using Agglomerative Clustering with cosine metric.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
embeddings: (N, D) float32 array
|
|
133
|
+
n_clusters: Number of clusters (None to use distance_threshold)
|
|
134
|
+
distance_threshold: Max linkage distance (used when n_clusters is None)
|
|
135
|
+
linkage: Linkage criterion ('average', 'complete', 'single')
|
|
136
|
+
distance_matrix: Optional precomputed distance matrix (N, N)
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Tuple of (labels, probabilities)
|
|
140
|
+
|
|
141
|
+
Raises:
|
|
142
|
+
ValueError: If ward linkage is requested (incompatible with cosine/precomputed)
|
|
143
|
+
"""
|
|
144
|
+
from sklearn.cluster import AgglomerativeClustering
|
|
145
|
+
|
|
146
|
+
# CRITICAL: ward linkage is INCOMPATIBLE with cosine/precomputed metric
|
|
147
|
+
if linkage == "ward":
|
|
148
|
+
raise ValueError(
|
|
149
|
+
"Ward linkage is incompatible with cosine distance. "
|
|
150
|
+
"Use 'average', 'complete', or 'single' instead."
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
metric = "precomputed" if distance_matrix is not None else "cosine"
|
|
154
|
+
fit_data = distance_matrix if distance_matrix is not None else embeddings
|
|
155
|
+
|
|
156
|
+
if n_clusters is not None:
|
|
157
|
+
clusterer = AgglomerativeClustering(
|
|
158
|
+
n_clusters=n_clusters,
|
|
159
|
+
metric=metric,
|
|
160
|
+
linkage=linkage,
|
|
161
|
+
)
|
|
162
|
+
else:
|
|
163
|
+
clusterer = AgglomerativeClustering(
|
|
164
|
+
n_clusters=None,
|
|
165
|
+
metric=metric,
|
|
166
|
+
linkage=linkage,
|
|
167
|
+
distance_threshold=distance_threshold,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
labels = clusterer.fit_predict(fit_data)
|
|
171
|
+
# Agglomerative does not produce probabilities
|
|
172
|
+
probabilities = np.ones(len(labels), dtype=np.float64)
|
|
173
|
+
|
|
174
|
+
return labels, probabilities
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def cluster_kmeans(
|
|
178
|
+
embeddings: np.ndarray,
|
|
179
|
+
n_clusters: int | None,
|
|
180
|
+
distance_matrix: np.ndarray | None = None,
|
|
181
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
182
|
+
"""
|
|
183
|
+
Cluster using K-Means.
|
|
184
|
+
|
|
185
|
+
When a precomputed distance_matrix is provided, K-Means cannot be used
|
|
186
|
+
directly (it requires feature vectors). In this case we fall back to
|
|
187
|
+
spectral embedding of the distance matrix into n_clusters dimensions,
|
|
188
|
+
then run K-Means on the spectral features.
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
embeddings: (N, D) float32 array
|
|
192
|
+
n_clusters: Number of clusters (defaults to sqrt(N) if None)
|
|
193
|
+
distance_matrix: Optional precomputed distance matrix (N, N)
|
|
194
|
+
|
|
195
|
+
Returns:
|
|
196
|
+
Tuple of (labels, probabilities)
|
|
197
|
+
"""
|
|
198
|
+
from sklearn.cluster import KMeans
|
|
199
|
+
|
|
200
|
+
if n_clusters is None:
|
|
201
|
+
# Reasonable default: sqrt(N), clamped to [2, N-1]
|
|
202
|
+
n_clusters = max(2, min(int(np.sqrt(len(embeddings))), len(embeddings) - 1))
|
|
203
|
+
|
|
204
|
+
if distance_matrix is not None:
|
|
205
|
+
# K-Means needs feature vectors; convert distance matrix via MDS
|
|
206
|
+
from sklearn.manifold import MDS
|
|
207
|
+
|
|
208
|
+
mds = MDS(
|
|
209
|
+
n_components=min(n_clusters, len(embeddings) - 1),
|
|
210
|
+
dissimilarity="precomputed",
|
|
211
|
+
random_state=42,
|
|
212
|
+
normalized_stress=False,
|
|
213
|
+
)
|
|
214
|
+
feature_vectors = mds.fit_transform(distance_matrix)
|
|
215
|
+
clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=42)
|
|
216
|
+
labels = clusterer.fit_predict(feature_vectors)
|
|
217
|
+
else:
|
|
218
|
+
clusterer = KMeans(n_clusters=n_clusters, n_init="auto")
|
|
219
|
+
labels = clusterer.fit_predict(embeddings)
|
|
220
|
+
|
|
221
|
+
# K-Means does not produce probabilities
|
|
222
|
+
probabilities = np.ones(len(labels), dtype=np.float64)
|
|
223
|
+
|
|
224
|
+
return labels, probabilities
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def compute_centroids(embeddings: np.ndarray, labels: np.ndarray) -> list[list[float]]:
    """
    Return one unit-length mean vector per cluster, skipping noise (-1).

    Args:
        embeddings: (N, D) float32 array
        labels: Cluster labels (N,)

    Returns:
        List of centroid vectors, one per cluster, in ascending label order.
        A centroid whose mean has zero norm is returned unnormalized.
    """

    def _unit_mean(members: np.ndarray) -> list[float]:
        mean_vec = members.mean(axis=0)
        length = np.linalg.norm(mean_vec)
        # Guard against dividing by zero for an all-zero mean vector.
        if length > 0:
            mean_vec = mean_vec / length
        return mean_vec.tolist()

    cluster_ids = [cid for cid in np.unique(labels).tolist() if cid != -1]
    return [_unit_mean(embeddings[labels == cid]) for cid in cluster_ids]
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def compute_coherence_scores(embeddings: np.ndarray, labels: np.ndarray) -> list[float]:
    """
    Score each cluster by the mean cosine similarity over its member pairs.

    Args:
        embeddings: (N, D) float32 array
        labels: Cluster labels (N,)

    Returns:
        List of coherence scores rounded to 6 decimals, one per cluster in
        ascending label order; noise (-1) is skipped and a single-member
        cluster scores 1.0
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def _mean_pairwise(members: np.ndarray) -> float:
        count = len(members)
        if count < 2:
            # A lone member has nothing to disagree with.
            return 1.0
        sims = cosine_similarity(members)
        # The matrix is symmetric: drop the diagonal and halve what is left
        # to get the sum over unordered pairs.
        pair_total = (sims.sum() - np.trace(sims)) / 2.0
        pair_count = count * (count - 1) / 2.0
        mean_sim = float(pair_total / pair_count) if pair_count > 0 else 1.0
        return round(mean_sim, 6)

    return [
        _mean_pairwise(embeddings[labels == cid])
        for cid in sorted(set(labels.tolist()))
        if cid != -1
    ]
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def compute_silhouette(embeddings: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute the cosine silhouette score, excluding noise points (label == -1).

    The score is only defined when the number of clusters is between 2 and
    (number of samples - 1). Outside that range 0.0 is returned instead of
    letting the scorer raise: this covers all-noise results, a single
    cluster, and the case where every remaining point is its own cluster.

    Args:
        embeddings: (N, D) float32 array
        labels: Cluster labels (N,)

    Returns:
        Silhouette score rounded to 6 decimals, or 0.0 when undefined
    """
    # Filter out noise
    non_noise_mask = labels >= 0
    filtered_embeddings = embeddings[non_noise_mask]
    filtered_labels = labels[non_noise_mask]

    n_samples = len(filtered_labels)
    n_found = len(set(filtered_labels.tolist()))
    # n_found >= n_samples means all-singleton clusters, which the scorer
    # rejects with a ValueError that would abort the whole clustering run.
    if n_found < 2 or n_found >= n_samples:
        return 0.0

    # Imported only once a score is actually going to be computed.
    from sklearn.metrics import silhouette_score

    score = silhouette_score(filtered_embeddings, filtered_labels, metric="cosine")
    return round(float(score), 6)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def run_clustering(data: dict) -> dict:
    """
    Validate the request, run the selected algorithm, and assemble metrics.

    Args:
        data: Parsed input JSON

    Returns:
        Result dict ready for JSON serialization (labels, probabilities,
        centroids, cluster/noise counts, silhouette and coherence scores,
        and elapsed wall time in milliseconds)
    """
    started = time.perf_counter()

    embeddings, _document_ids, algorithm, params, distance_matrix = validate_inputs(data)

    # Each entry defers its call so only the requested algorithm runs.
    # validate_inputs has already restricted algorithm to these three keys.
    runners = {
        "hdbscan": lambda: cluster_hdbscan(
            embeddings, params["min_cluster_size"], distance_matrix
        ),
        "agglomerative": lambda: cluster_agglomerative(
            embeddings,
            params["n_clusters"],
            params["distance_threshold"],
            params["linkage"],
            distance_matrix,
        ),
        "kmeans": lambda: cluster_kmeans(embeddings, params["n_clusters"], distance_matrix),
    }
    labels, probabilities = runners[algorithm]()

    # Convert numpy values to plain Python types for JSON output.
    labels_list = labels.tolist()
    noise_indices = np.flatnonzero(labels == -1).tolist()

    # Real clusters only: the noise label -1 is not a cluster.
    cluster_count = len(set(labels_list) - {-1})

    centroids = compute_centroids(embeddings, labels)
    coherence_scores = compute_coherence_scores(embeddings, labels)
    silhouette = compute_silhouette(embeddings, labels)

    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

    return {
        "success": True,
        "labels": labels_list,
        "probabilities": [round(float(p), 6) for p in probabilities],
        "centroids": centroids,
        "n_clusters": cluster_count,
        "noise_count": len(noise_indices),
        "noise_indices": noise_indices,
        "silhouette_score": silhouette,
        "coherence_scores": coherence_scores,
        "elapsed_ms": elapsed_ms,
    }
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _emit_error(message: str, error_type: str) -> None:
    """Print a structured failure payload to stdout and exit with status 1."""
    print(json.dumps({"success": False, "error": message, "error_type": error_type}))
    sys.exit(1)


def main() -> None:
    """
    Entry point: read JSON from stdin, write JSON to stdout.

    Exactly one JSON document is printed. On success it is the clustering
    result and the process exits with status 0; on any failure it is an
    object with 'success': False, 'error' and 'error_type', and the process
    exits with status 1.
    """
    try:
        raw_input = sys.stdin.read()
        if not raw_input.strip():
            raise ValueError("Empty input on stdin")

        data = json.loads(raw_input)
        result = run_clustering(data)
        # allow_nan=False: by default NaN/Infinity are written as bare
        # tokens, which are not valid JSON and break strict parsers. A
        # non-finite value now raises ValueError and is reported below.
        payload = json.dumps(result, allow_nan=False)

    # JSONDecodeError subclasses ValueError, so it must be handled first.
    except json.JSONDecodeError as e:
        _emit_error(f"Invalid JSON input: {e}", "JSONDecodeError")

    except ValueError as e:
        _emit_error(str(e), "ValueError")

    except ImportError as e:
        _emit_error(
            f"Missing dependency: {e}. Requires scikit-learn >= 1.3 and numpy.",
            "ImportError",
        )

    except Exception as e:
        # Last-resort boundary: the caller always gets a JSON error object.
        _emit_error(str(e), type(e).__name__)

    else:
        print(payload)
        sys.exit(0)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
# Run the worker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|