ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,1394 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Report MCP Tools
|
|
3
|
+
*
|
|
4
|
+
* Tools for generating evaluation reports on OCR and VLM processing results.
|
|
5
|
+
* Produces markdown reports with statistics, metrics, and quality analysis.
|
|
6
|
+
*
|
|
7
|
+
* CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
|
|
8
|
+
* Use console.error() for all logging.
|
|
9
|
+
*
|
|
10
|
+
* @module tools/reports
|
|
11
|
+
*/
|
|
12
|
+
import { z } from 'zod';
|
|
13
|
+
import * as fs from 'fs';
|
|
14
|
+
import { dirname } from 'path';
|
|
15
|
+
import { safeMin, safeMax } from '../utils/math.js';
|
|
16
|
+
import { requireDatabase } from '../server/state.js';
|
|
17
|
+
import { successResult } from '../server/types.js';
|
|
18
|
+
import { MCPError } from '../server/errors.js';
|
|
19
|
+
import { formatResponse, handleError } from './shared.js';
|
|
20
|
+
import { validateInput, sanitizePath } from '../utils/validation.js';
|
|
21
|
+
import { getImageStats, getImagesByDocument, } from '../services/storage/database/image-operations.js';
|
|
22
|
+
import { getComparisonSummariesByDocument } from '../services/storage/database/comparison-operations.js';
|
|
23
|
+
import { getClusteringStats, getClusterSummariesForDocument, } from '../services/storage/database/cluster-operations.js';
|
|
24
|
+
// ===============================================================================
|
|
25
|
+
// VALIDATION SCHEMAS
|
|
26
|
+
// ===============================================================================
|
|
27
|
+
/**
 * Input for ocr_evaluation_report.
 * - output_path: optional destination file for the markdown report.
 * - confidence_threshold: number in [0, 1], defaults to 0.7.
 */
const EvaluationReportInput = z.object({
    output_path: z.string().optional(),
    confidence_threshold: z
        .number()
        .min(0)
        .max(1)
        .default(0.7),
});

/**
 * Input for ocr_document_report: the (non-empty) id of the document to report on.
 */
const DocumentReportInput = z.object({
    document_id: z.string().min(1),
});

/**
 * Input for ocr_report_overview.
 * - section: which part of the overview to build (defaults to 'all').
 * - include_*: toggles for the optional corpus distributions.
 * - limit: row cap (1-100, default 20) applied to those optional distributions.
 */
const ReportOverviewInput = z.object({
    section: z
        .enum(['quality', 'corpus', 'all'])
        .default('all'),
    include_section_frequency: z.boolean().default(true),
    include_content_type_distribution: z.boolean().default(true),
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(20),
});

/**
 * Input for ocr_report_performance.
 * NOTE(review): only `section` is read in the visible part of the handler;
 * the remaining fields are presumably consumed further down - confirm.
 */
const ReportPerformanceInput = z.object({
    section: z
        .enum(['pipeline', 'throughput', 'bottlenecks', 'all'])
        .default('all'),
    group_by: z
        .enum(['total', 'document', 'mode', 'file_type'])
        .default('total'),
    limit: z
        .number()
        .int()
        .min(1)
        .max(100)
        .default(20),
    processor_filter: z.string().optional(),
    bucket: z
        .enum(['hourly', 'daily', 'weekly', 'monthly'])
        .default('daily'),
    created_after: z.string().optional(),
    created_before: z.string().optional(),
});

/**
 * Input for error analytics: whether to include error messages and how many
 * rows to return (1-50, default 10).
 * NOTE(review): the consuming handler is not visible here - confirm usage.
 */
const ErrorAnalyticsInput = z.object({
    include_error_messages: z.boolean().default(true),
    limit: z
        .number()
        .int()
        .min(1)
        .max(50)
        .default(10),
});

// MERGE-C: Unified trends schema (ocr_quality_trends + ocr_timeline_analytics → ocr_trends)
const TrendsInput = z.object({
    // Required discriminator: selects which kind of trend is produced.
    metric: z
        .enum(['quality', 'volume'])
        .describe('Trend type: quality (OCR scores over time) or volume (processing counts over time)'),
    bucket: z
        .enum(['hourly', 'daily', 'weekly', 'monthly'])
        .default('daily'),
    created_after: z.string().optional(),
    created_before: z.string().optional(),
    // quality-specific
    group_by: z
        .enum(['none', 'ocr_mode', 'processor'])
        .default('none')
        .describe('(quality only) Group by OCR mode or processor'),
    // volume-specific
    volume_metric: z
        .enum(['documents', 'pages', 'chunks', 'embeddings', 'images', 'cost'])
        .default('documents')
        .describe('(volume only) Which metric to track over time'),
});
|
|
66
|
+
/**
 * Handle ocr_evaluation_report - Generate comprehensive evaluation report.
 *
 * Gathers database-wide image statistics, per-document VLM statistics, the
 * lowest-confidence images, comparison statistics and clustering statistics,
 * then renders them through generateMarkdownReport(). When `output_path` is
 * given, the markdown is written to that (sanitized) path and left out of the
 * response; otherwise the markdown is returned inline in `report`.
 *
 * @param {object} params - Raw tool arguments, validated against EvaluationReportInput.
 * @param {string} [params.output_path] - Optional file path for the markdown report.
 * @param {number} [params.confidence_threshold=0.7] - Completed images whose VLM
 *   confidence is strictly below this value are listed as low confidence.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError(error) if anything throws.
 */
export async function handleEvaluationReport(params) {
    try {
        const input = validateInput(EvaluationReportInput, params);
        const outputPath = input.output_path;
        const confidenceThreshold = input.confidence_threshold ?? 0.7;
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Get overall stats
        const imageStats = getImageStats(conn);
        const dbStats = db.getStats();
        // Get per-document stats
        // NOTE(review): the listing is requested with limit 1000, so the per-document
        // stats and the summary totals derived from `documents` presumably cover at
        // most 1000 documents - confirm against listDocuments().
        const documents = db.listDocuments({ limit: 1000 });
        const docStats = [];
        const imageTypeDistribution = {};
        let totalConfidence = 0;
        let confidenceCount = 0;
        // M-10: Prepare per-document image status count query (reuse statement)
        const docImageCountStmt = conn.prepare(`
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN vlm_status = 'pending' THEN 1 END) as pending,
                COUNT(CASE WHEN vlm_status = 'failed' THEN 1 END) as failed
            FROM images WHERE document_id = ?
        `);
        for (const doc of documents) {
            // M-10: Use vlmStatus filter to only load complete images from SQL
            const completeImages = getImagesByDocument(conn, doc.id, {
                vlmStatus: 'complete',
            });
            const ocrResult = db.getOCRResultByDocumentId(doc.id);
            const docImageCounts = docImageCountStmt.get(doc.id);
            // `!= null` drops both null and undefined so a missing value can never
            // turn the sums below into NaN.
            const confidences = completeImages
                .filter((i) => i.vlm_confidence != null)
                .map((i) => i.vlm_confidence);
            // Track image types (per document and across the whole report)
            const docImageTypes = {};
            for (const img of completeImages) {
                if (img.vlm_structured_data) {
                    const imageType = img.vlm_structured_data.imageType || 'other';
                    docImageTypes[imageType] = (docImageTypes[imageType] || 0) + 1;
                    imageTypeDistribution[imageType] = (imageTypeDistribution[imageType] || 0) + 1;
                }
            }
            // Calculate stats; the sum is computed once and shared by the
            // per-document average and the running overall total.
            const confidenceSum = confidences.reduce((a, b) => a + b, 0);
            const avgConfidence = confidences.length > 0 ? confidenceSum / confidences.length : 0;
            totalConfidence += confidenceSum;
            confidenceCount += confidences.length;
            docStats.push({
                document_id: doc.id,
                file_name: doc.file_name,
                page_count: doc.page_count,
                ocr_text_length: ocrResult?.text_length ?? 0,
                image_count: docImageCounts.total,
                vlm_complete: completeImages.length,
                vlm_pending: docImageCounts.pending,
                vlm_failed: docImageCounts.failed,
                avg_confidence: avgConfidence,
                min_confidence: safeMin(confidences) ?? 0,
                max_confidence: safeMax(confidences) ?? 0,
                image_types: docImageTypes,
            });
        }
        // M-10: Direct SQL for low confidence images instead of tracking in per-document loop
        const lowConfidenceImages = conn
            .prepare(`
                SELECT i.id as image_id, i.document_id, d.file_name, i.page_number as page,
                    i.vlm_confidence as confidence,
                    COALESCE(json_extract(i.vlm_structured_data, '$.imageType'), 'unknown') as image_type,
                    COALESCE(i.extracted_path, 'unknown') as path
                FROM images i
                JOIN documents d ON d.id = i.document_id
                WHERE i.vlm_status = 'complete'
                    AND i.vlm_confidence IS NOT NULL
                    AND i.vlm_confidence < ?
                ORDER BY i.vlm_confidence ASC
                LIMIT 50
            `)
            .all(confidenceThreshold);
        // Calculate overall average confidence
        const overallAvgConfidence = confidenceCount > 0 ? totalConfidence / confidenceCount : 0;
        // Comparison statistics (avg_similarity is NULL when the table is empty)
        const comparisonSummary = conn
            .prepare(`
                SELECT COUNT(*) as count, AVG(similarity_ratio) as avg_similarity
                FROM comparisons
            `)
            .get();
        const comparisonCount = comparisonSummary.count;
        const avgComparisonSimilarity = comparisonSummary.avg_similarity;
        // Clustering statistics
        const clusteringStats = getClusteringStats(conn);
        // Generate markdown report
        const report = generateMarkdownReport({
            dbStats,
            imageStats,
            docStats,
            lowConfidenceImages, // Already limited to 50 by SQL query
            imageTypeDistribution,
            overallAvgConfidence,
            confidenceThreshold,
            comparisonStats: { total: comparisonCount, avg_similarity: avgComparisonSimilarity },
            clusteringStats,
        });
        // Save to file if path provided
        if (outputPath) {
            const safeOutputPath = sanitizePath(outputPath);
            // A recursive mkdir is a no-op for an existing directory, so no
            // separate existence check (and its check-then-act race) is needed.
            fs.mkdirSync(dirname(safeOutputPath), { recursive: true });
            fs.writeFileSync(safeOutputPath, report);
            console.error(`[INFO] Report saved to: ${safeOutputPath}`);
        }
        return formatResponse(successResult({
            summary: {
                total_documents: documents.length,
                total_pages: documents.reduce((sum, d) => sum + (d.page_count || 0), 0),
                total_images: imageStats.total,
                vlm_processed: imageStats.processed,
                vlm_pending: imageStats.pending,
                vlm_failed: imageStats.failed,
                overall_avg_confidence: overallAvgConfidence,
                low_confidence_count: lowConfidenceImages.length,
                total_comparisons: comparisonCount,
                avg_comparison_similarity: avgComparisonSimilarity,
                total_clusters: clusteringStats.total_clusters,
                total_cluster_runs: clusteringStats.total_runs,
                avg_coherence: clusteringStats.avg_coherence,
            },
            image_type_distribution: imageTypeDistribution,
            output_path: outputPath ?? null,
            report: outputPath ? null : report, // Only include report in response if not saved to file
            next_steps: [
                { tool: 'ocr_report_overview', description: 'Get quality and corpus overview' },
                { tool: 'ocr_evaluate', description: 'Evaluate more images' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
212
|
+
/**
 * Handle ocr_document_report - Generate report for a single document.
 *
 * Looks the document up by id and returns its metadata together with its OCR
 * result, chunk count, per-image VLM details and confidence statistics,
 * extractions, comparisons and cluster memberships.
 *
 * @param {object} params - Raw tool arguments, validated against DocumentReportInput.
 * @param {string} params.document_id - Id of the document to report on.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError(error) - including the DOCUMENT_NOT_FOUND case - on failure.
 */
export async function handleDocumentReport(params) {
    try {
        const { document_id: documentId } = validateInput(DocumentReportInput, params);
        const { db } = requireDatabase();
        const doc = db.getDocument(documentId);
        if (!doc) {
            throw new MCPError('DOCUMENT_NOT_FOUND', `Document not found: ${documentId}`, {
                document_id: documentId,
            });
        }
        const ocrResult = db.getOCRResultByDocumentId(documentId);
        const images = getImagesByDocument(db.getConnection(), documentId);
        const chunks = db.getChunksByDocumentId(documentId);
        const extractions = db.getExtractionsByDocument(documentId);

        // One pass over the images: tally VLM statuses and, for completed
        // images only, collect confidences and the image-type histogram.
        let completeCount = 0;
        let pendingCount = 0;
        let failedCount = 0;
        let confidenceSum = 0;
        const confidences = [];
        const imageTypes = {};
        for (const img of images) {
            switch (img.vlm_status) {
                case 'complete': {
                    completeCount += 1;
                    if (img.vlm_confidence !== null) {
                        confidences.push(img.vlm_confidence);
                        confidenceSum += img.vlm_confidence;
                    }
                    if (img.vlm_structured_data) {
                        const imageType = img.vlm_structured_data.imageType || 'other';
                        imageTypes[imageType] = (imageTypes[imageType] || 0) + 1;
                    }
                    break;
                }
                case 'pending':
                    pendingCount += 1;
                    break;
                case 'failed':
                    failedCount += 1;
                    break;
                default:
                    // Other statuses only count towards the total.
                    break;
            }
        }

        // Per-image detail rows (all images, regardless of status)
        const describeImage = (img) => ({
            id: img.id,
            page: img.page_number,
            index: img.image_index,
            format: img.format,
            dimensions: img.dimensions,
            vlm_status: img.vlm_status,
            confidence: img.vlm_confidence,
            image_type: img.vlm_structured_data?.imageType || null,
            primary_subject: img.vlm_structured_data?.primarySubject || null,
            description_length: img.vlm_description?.length ?? 0,
            has_embedding: Boolean(img.vlm_embedding_id),
            error: img.error_message,
        });
        const imageDetails = images.map(describeImage);

        const docComparisons = getComparisonSummariesByDocument(db.getConnection(), documentId);
        const docClusterMemberships = getClusterSummariesForDocument(db.getConnection(), documentId);

        // Stored JSON text is parsed for the response; empty/absent text maps to null.
        const parseJsonOrNull = (text) => (text ? JSON.parse(text) : null);

        const documentSection = {
            id: doc.id,
            file_name: doc.file_name,
            file_path: doc.file_path,
            file_type: doc.file_type,
            file_size: doc.file_size,
            status: doc.status,
            page_count: doc.page_count,
            doc_title: doc.doc_title ?? null,
            doc_author: doc.doc_author ?? null,
            doc_subject: doc.doc_subject ?? null,
        };

        let ocrSection = null;
        if (ocrResult) {
            ocrSection = {
                text_length: ocrResult.text_length,
                quality_score: ocrResult.parse_quality_score,
                processing_duration_ms: ocrResult.processing_duration_ms,
                mode: ocrResult.datalab_mode,
                cost_cents: ocrResult.cost_cents,
                datalab_request_id: ocrResult.datalab_request_id,
                content_hash: ocrResult.content_hash,
            };
        }

        const imagesSection = {
            total: images.length,
            complete: completeCount,
            pending: pendingCount,
            failed: failedCount,
            avg_confidence: confidences.length > 0 ? confidenceSum / confidences.length : null,
            min_confidence: safeMin(confidences) ?? null,
            max_confidence: safeMax(confidences) ?? null,
            type_distribution: imageTypes,
            details: imageDetails,
        };

        const extractionItems = extractions.map((e) => ({
            id: e.id,
            schema: parseJsonOrNull(e.schema_json),
            result: parseJsonOrNull(e.extraction_json),
            created_at: e.created_at,
            provenance_id: e.provenance_id,
        }));

        const comparisonItems = docComparisons.map((c) => ({
            id: c.id,
            // Report the *other* document of the compared pair.
            compared_with: c.document_id_1 === documentId ? c.document_id_2 : c.document_id_1,
            similarity_ratio: c.similarity_ratio,
            summary: c.summary,
            created_at: c.created_at,
            processing_duration_ms: c.processing_duration_ms,
        }));

        const clusterItems = docClusterMemberships.map((c) => ({
            cluster_id: c.id,
            run_id: c.run_id,
            cluster_index: c.cluster_index,
            label: c.label,
            classification_tag: c.classification_tag,
            coherence_score: c.coherence_score,
        }));

        return formatResponse(successResult({
            document: documentSection,
            ocr: ocrSection,
            chunks: {
                total: chunks.length,
            },
            images: imagesSection,
            extractions: {
                total: extractions.length,
                items: extractionItems,
            },
            comparisons: {
                total: docComparisons.length,
                items: comparisonItems,
            },
            clusters: {
                total: docClusterMemberships.length,
                items: clusterItems,
            },
            next_steps: [
                { tool: 'ocr_document_get', description: 'Get document metadata' },
                { tool: 'ocr_search', description: 'Search within this document' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
341
|
+
/**
 * Handle ocr_report_overview - Consolidated quality + corpus overview.
 * Merges former ocr_quality_summary and ocr_corpus_profile.
 *
 * The `section` argument ('quality' | 'corpus' | 'all', default 'all') selects
 * which parts are computed: 'quality' fills `result.quality`, 'corpus' fills
 * `result.corpus`, and 'all' fills both.
 *
 * @param {object} params - Raw tool arguments, validated against ReportOverviewInput.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError(error) if anything throws.
 */
export async function handleReportOverview(params) {
    try {
        const input = validateInput(ReportOverviewInput, params);
        const section = input.section ?? 'all';
        const { db } = requireDatabase();
        const conn = db.getConnection();

        // True when the named section was requested directly or via 'all'.
        const wants = (name) => section === name || section === 'all';
        // "part / whole" as a one-decimal percentage string; '0%' for an empty whole.
        const asPercent = (part, whole) => (whole > 0 ? `${((part / whole) * 100).toFixed(1)}%` : '0%');
        // Aggregates over zero rows are reported as null rather than the raw SQL value.
        const ifAny = (count, value) => (count > 0 ? value : null);

        // ---- Quality section (former ocr_quality_summary) ----
        const collectQuality = () => {
            const imageStats = getImageStats(conn);
            const dbStats = db.getStats();

            const confidenceSql = `
                SELECT
                    COUNT(*) as cnt,
                    AVG(vlm_confidence) as avg_conf,
                    MIN(vlm_confidence) as min_conf,
                    MAX(vlm_confidence) as max_conf,
                    SUM(CASE WHEN vlm_confidence >= 0.9 THEN 1 ELSE 0 END) as high,
                    SUM(CASE WHEN vlm_confidence >= 0.7 AND vlm_confidence < 0.9 THEN 1 ELSE 0 END) as medium,
                    SUM(CASE WHEN vlm_confidence >= 0.5 AND vlm_confidence < 0.7 THEN 1 ELSE 0 END) as low,
                    SUM(CASE WHEN vlm_confidence < 0.5 THEN 1 ELSE 0 END) as very_low
                FROM images
                WHERE vlm_status = 'complete' AND vlm_confidence IS NOT NULL
            `;
            const confStats = conn.prepare(confidenceSql).get();

            const ocrQualitySql = `
                SELECT
                    COUNT(parse_quality_score) as scored_count,
                    AVG(parse_quality_score) as avg_quality,
                    MIN(parse_quality_score) as min_quality,
                    MAX(parse_quality_score) as max_quality,
                    SUM(CASE WHEN parse_quality_score >= 4 THEN 1 ELSE 0 END) as excellent,
                    SUM(CASE WHEN parse_quality_score >= 3 AND parse_quality_score < 4 THEN 1 ELSE 0 END) as good,
                    SUM(CASE WHEN parse_quality_score >= 2 AND parse_quality_score < 3 THEN 1 ELSE 0 END) as fair,
                    SUM(CASE WHEN parse_quality_score < 2 THEN 1 ELSE 0 END) as poor,
                    COALESCE(SUM(cost_cents), 0) as total_ocr_cost
                FROM ocr_results
            `;
            const ocrQualityStats = conn.prepare(ocrQualitySql).get();

            const formFillCost = conn
                .prepare('SELECT COALESCE(SUM(cost_cents), 0) as total FROM form_fills')
                .get().total;

            const comparisonSql = `
                SELECT
                    COUNT(*) as total,
                    AVG(similarity_ratio) as avg_similarity,
                    MIN(similarity_ratio) as min_similarity,
                    MAX(similarity_ratio) as max_similarity
                FROM comparisons
            `;
            const comparisonStats = conn.prepare(comparisonSql).get();

            const qualityClusteringStats = getClusteringStats(conn);

            const scored = ocrQualityStats.scored_count;
            const compared = comparisonStats.total;
            return {
                documents: {
                    total: dbStats.total_documents,
                    complete: dbStats.documents_by_status.complete,
                    failed: dbStats.documents_by_status.failed,
                    pending: dbStats.documents_by_status.pending,
                },
                ocr: {
                    total_chunks: dbStats.total_chunks,
                    total_embeddings: dbStats.total_embeddings,
                },
                ocr_quality: {
                    average: ifAny(scored, ocrQualityStats.avg_quality),
                    min: ifAny(scored, ocrQualityStats.min_quality),
                    max: ifAny(scored, ocrQualityStats.max_quality),
                    scored_count: scored,
                    // SUM() over zero rows is NULL, hence the `|| 0` fallbacks.
                    distribution: {
                        excellent_gte4: ocrQualityStats.excellent || 0,
                        good_3to4: ocrQualityStats.good || 0,
                        fair_2to3: ocrQualityStats.fair || 0,
                        poor_lt2: ocrQualityStats.poor || 0,
                    },
                },
                costs: {
                    total_ocr_cost_cents: ocrQualityStats.total_ocr_cost,
                    total_form_fill_cost_cents: formFillCost,
                    total_cost_cents: ocrQualityStats.total_ocr_cost + formFillCost,
                },
                images: {
                    total: imageStats.total,
                    processed: imageStats.processed,
                    pending: imageStats.pending,
                    failed: imageStats.failed,
                    processing_rate: asPercent(imageStats.processed, imageStats.total),
                },
                vlm_confidence: {
                    average: ifAny(confStats.cnt, confStats.avg_conf),
                    min: ifAny(confStats.cnt, confStats.min_conf),
                    max: ifAny(confStats.cnt, confStats.max_conf),
                    distribution: {
                        high: confStats.high || 0,
                        medium: confStats.medium || 0,
                        low: confStats.low || 0,
                        very_low: confStats.very_low || 0,
                    },
                },
                extractions: {
                    total: dbStats.total_extractions,
                    extraction_rate: asPercent(dbStats.total_extractions, dbStats.total_documents),
                },
                form_fills: {
                    total: dbStats.total_form_fills,
                },
                comparisons: {
                    total: compared,
                    avg_similarity: ifAny(compared, comparisonStats.avg_similarity),
                    min_similarity: ifAny(compared, comparisonStats.min_similarity),
                    max_similarity: ifAny(compared, comparisonStats.max_similarity),
                },
                clustering: {
                    total_clusters: qualityClusteringStats.total_clusters,
                    total_runs: qualityClusteringStats.total_runs,
                    avg_coherence: ifAny(qualityClusteringStats.total_clusters, qualityClusteringStats.avg_coherence),
                },
            };
        };

        // ---- Corpus section (former ocr_corpus_profile) ----
        const collectCorpus = () => {
            // Document size distribution (completed documents only)
            const docSizeSql = `
                SELECT
                    COALESCE(AVG(page_count), 0) as avg_page_count,
                    COALESCE(MIN(page_count), 0) as min_page_count,
                    COALESCE(MAX(page_count), 0) as max_page_count,
                    COALESCE(AVG(file_size), 0) as avg_file_size,
                    COALESCE(SUM(file_size), 0) as total_file_size,
                    COUNT(*) as total_documents
                FROM documents
                WHERE status = 'complete'
            `;
            const docSizeStats = conn.prepare(docSizeSql).get();

            // File types are counted over all documents, whatever their status.
            const fileTypeSql = `
                SELECT file_type, COUNT(*) as count
                FROM documents
                GROUP BY file_type
                ORDER BY count DESC
            `;
            const fileTypeDistribution = conn.prepare(fileTypeSql).all();

            const chunkSql = `
                SELECT
                    COALESCE(COUNT(*), 0) as total_chunks,
                    COALESCE(AVG(LENGTH(text)), 0) as avg_text_length,
                    COALESCE(MIN(LENGTH(text)), 0) as min_text_length,
                    COALESCE(MAX(LENGTH(text)), 0) as max_text_length,
                    COALESCE(SUM(CASE WHEN is_atomic = 1 THEN 1 ELSE 0 END), 0) as atomic_chunks,
                    COALESCE(SUM(CASE WHEN heading_context IS NOT NULL AND heading_context != '' THEN 1 ELSE 0 END), 0) as chunks_with_headings
                FROM chunks
            `;
            const chunkStats = conn.prepare(chunkSql).get();

            const chunksPerDocSql = `
                SELECT
                    COALESCE(AVG(cnt), 0) as avg_chunks,
                    COALESCE(MIN(cnt), 0) as min_chunks,
                    COALESCE(MAX(cnt), 0) as max_chunks
                FROM (SELECT COUNT(*) as cnt FROM chunks GROUP BY document_id)
            `;
            const chunksPerDoc = conn.prepare(chunksPerDocSql).get();

            const avgContentTypesSql = `
                SELECT COALESCE(AVG(
                    CASE
                        WHEN content_types IS NOT NULL AND content_types != '[]' AND content_types != ''
                        THEN json_array_length(content_types)
                        ELSE 0
                    END
                ), 0) as avg_content_types
                FROM chunks
            `;
            const avgContentTypes = conn.prepare(avgContentTypesSql).get();

            const corpusData = {
                documents: {
                    total_complete: docSizeStats.total_documents,
                    avg_page_count: docSizeStats.avg_page_count,
                    min_page_count: docSizeStats.min_page_count,
                    max_page_count: docSizeStats.max_page_count,
                    avg_file_size: docSizeStats.avg_file_size,
                    total_file_size: docSizeStats.total_file_size,
                },
                file_types: fileTypeDistribution,
                chunks: {
                    total_chunks: chunkStats.total_chunks,
                    avg_text_length: chunkStats.avg_text_length,
                    min_text_length: chunkStats.min_text_length,
                    max_text_length: chunkStats.max_text_length,
                    avg_content_types_per_chunk: avgContentTypes.avg_content_types,
                    atomic_chunks: chunkStats.atomic_chunks,
                    chunks_with_headings: chunkStats.chunks_with_headings,
                    per_document: {
                        avg: chunksPerDoc.avg_chunks,
                        min: chunksPerDoc.min_chunks,
                        max: chunksPerDoc.max_chunks,
                    },
                },
            };

            // Optional: most frequent content types, capped at `limit` rows.
            if (input.include_content_type_distribution) {
                const contentTypeSql = `
                    SELECT
                        j.value as content_type,
                        COUNT(*) as count
                    FROM chunks, json_each(COALESCE(content_types, '[]')) j
                    GROUP BY j.value
                    ORDER BY count DESC
                    LIMIT ?
                `;
                corpusData.content_type_distribution = conn.prepare(contentTypeSql).all(input.limit);
            }

            // Optional: most frequent heading contexts, capped at `limit` rows.
            if (input.include_section_frequency) {
                const sectionFrequencySql = `
                    SELECT
                        heading_context,
                        COUNT(*) as occurrence_count,
                        COUNT(DISTINCT document_id) as document_count
                    FROM chunks
                    WHERE heading_context IS NOT NULL AND heading_context != ''
                    GROUP BY heading_context
                    ORDER BY occurrence_count DESC
                    LIMIT ?
                `;
                corpusData.section_frequency = conn.prepare(sectionFrequencySql).all(input.limit);
            }

            // Always included: image types of completed images with structured data.
            const imageTypeSql = `
                SELECT
                    COALESCE(json_extract(vlm_structured_data, '$.imageType'), 'unknown') as image_type,
                    COUNT(*) as count
                FROM images
                WHERE vlm_status = 'complete' AND vlm_structured_data IS NOT NULL
                GROUP BY image_type
                ORDER BY count DESC
            `;
            corpusData.image_type_distribution = conn.prepare(imageTypeSql).all();

            return corpusData;
        };

        const result = { section };
        if (wants('quality')) {
            result.quality = collectQuality();
        }
        if (wants('corpus')) {
            result.corpus = collectCorpus();
        }
        result.next_steps = [
            { tool: 'ocr_report_performance', description: 'Get pipeline performance analytics' },
            { tool: 'ocr_error_analytics', description: 'Analyze errors and failures' },
            { tool: 'ocr_trends', description: 'View quality/volume trends over time' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
605
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
606
|
+
// COST ANALYTICS HANDLER
|
|
607
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
608
|
+
/**
 * Handle ocr_cost_summary - Get cost analytics for OCR and form fill operations.
 *
 * Always returns overall OCR/form-fill cost totals plus the compute durations
 * recorded for comparisons and clusters. `group_by` ('document' | 'mode' |
 * 'month' | 'total', default 'total') optionally adds one OCR cost breakdown:
 * `by_document` (top 50 by cost), `by_mode` or `by_month`.
 *
 * @param {object} params - Raw tool arguments; only `group_by` is accepted.
 * @returns {Promise<object>} The formatted success response, or the result of
 *   handleError(error) if anything throws.
 */
async function handleCostSummary(params) {
    try {
        const costSummarySchema = z.object({
            group_by: z.enum(['document', 'mode', 'month', 'total']).default('total'),
        });
        const input = validateInput(costSummarySchema, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();

        // Overall totals; the *_count columns only count rows that actually cost something.
        const totalsSql = `
            SELECT
                (SELECT COALESCE(SUM(cost_cents), 0) FROM ocr_results) as ocr_cost,
                (SELECT COALESCE(SUM(cost_cents), 0) FROM form_fills) as form_fill_cost,
                (SELECT COUNT(*) FROM ocr_results WHERE cost_cents > 0) as ocr_count,
                (SELECT COUNT(*) FROM form_fills WHERE cost_cents > 0) as form_fill_count
        `;
        const totals = conn.prepare(totalsSql).get();
        const result = {
            total_cost_cents: totals.ocr_cost + totals.form_fill_cost,
            total_cost_dollars: ((totals.ocr_cost + totals.form_fill_cost) / 100).toFixed(2),
            ocr: { total_cents: totals.ocr_cost, document_count: totals.ocr_count },
            form_fill: { total_cents: totals.form_fill_cost, fill_count: totals.form_fill_count },
        };

        // Optional OCR cost breakdown; 'total' adds nothing beyond the totals above.
        switch (input.group_by) {
            case 'mode': {
                const byModeSql = `
                    SELECT datalab_mode as mode, COUNT(*) as count, COALESCE(SUM(cost_cents), 0) as total_cents
                    FROM ocr_results WHERE cost_cents > 0 GROUP BY datalab_mode
                `;
                result.by_mode = conn.prepare(byModeSql).all();
                break;
            }
            case 'document': {
                const byDocumentSql = `
                    SELECT d.file_name, o.datalab_mode as mode, o.cost_cents, o.page_count
                    FROM ocr_results o JOIN documents d ON d.id = o.document_id
                    WHERE o.cost_cents > 0 ORDER BY o.cost_cents DESC LIMIT 50
                `;
                result.by_document = conn.prepare(byDocumentSql).all();
                break;
            }
            case 'month': {
                const byMonthSql = `
                    SELECT strftime('%Y-%m', processing_completed_at) as month,
                        COUNT(*) as count, COALESCE(SUM(cost_cents), 0) as total_cents
                    FROM ocr_results WHERE cost_cents > 0
                    GROUP BY strftime('%Y-%m', processing_completed_at) ORDER BY month DESC
                `;
                result.by_month = conn.prepare(byMonthSql).all();
                break;
            }
            default:
                break;
        }

        // Comparison processing durations (compute-only, no API cost)
        const comparisonDurationSql = `
            SELECT COUNT(*) as count,
                COALESCE(SUM(processing_duration_ms), 0) as total_ms,
                AVG(processing_duration_ms) as avg_ms
            FROM comparisons
        `;
        const compDurations = conn.prepare(comparisonDurationSql).get();
        result.comparison_compute = {
            total_comparisons: compDurations.count,
            total_duration_ms: compDurations.total_ms,
            avg_duration_ms: compDurations.avg_ms,
        };

        // Clustering processing durations (compute-only, no API cost)
        const clusterDurationSql = `
            SELECT COUNT(*) as count,
                COUNT(DISTINCT run_id) as runs,
                COALESCE(SUM(processing_duration_ms), 0) as total_ms,
                AVG(processing_duration_ms) as avg_ms
            FROM clusters
        `;
        const clusterDurations = conn.prepare(clusterDurationSql).get();
        result.clustering_compute = {
            total_clusters: clusterDurations.count,
            total_runs: clusterDurations.runs,
            total_duration_ms: clusterDurations.total_ms,
            avg_duration_ms: clusterDurations.avg_ms,
        };

        result.next_steps = [
            { tool: 'ocr_report_performance', description: 'Get pipeline performance analytics' },
            { tool: 'ocr_db_stats', description: 'Get database overview statistics' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
700
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
701
|
+
// CONSOLIDATED PERFORMANCE REPORT HANDLER
|
|
702
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
703
|
+
/**
 * Handle ocr_report_performance - Consolidated pipeline + throughput + bottlenecks
 * Merges former ocr_pipeline_analytics, ocr_throughput_analytics, and ocr_provenance_bottlenecks.
 * section='pipeline' | 'throughput' | 'bottlenecks' | 'all' (default: 'all')
 *
 * The response always carries `section` and `next_steps`; `pipeline`,
 * `throughput` and `bottlenecks` are only present when that section was requested.
 *
 * @param {object} params - Raw tool arguments, validated against ReportPerformanceInput.
 * @returns {Promise<object>} The value of formatResponse(successResult(result)), or the
 *   value of handleError(error) when validation or any query throws.
 */
export async function handleReportPerformance(params) {
    try {
        const input = validateInput(ReportPerformanceInput, params);
        const section = input.section ?? 'all';
        const { db } = requireDatabase();
        const conn = db.getConnection();
        const result = { section };
        // A section is built when it is requested by name or when everything is requested.
        const wants = (name) => section === name || section === 'all';
        // ---- Pipeline section (former ocr_pipeline_analytics) ----
        if (wants('pipeline')) {
            const ocrStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total_docs,
                        COALESCE(SUM(page_count), 0) as total_pages,
                        COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
                        COALESCE(MIN(processing_duration_ms), 0) as min_duration_ms,
                        COALESCE(MAX(processing_duration_ms), 0) as max_duration_ms,
                        COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms,
                        COALESCE(AVG(parse_quality_score), 0) as avg_quality
                    FROM ocr_results
                `)
                .get();
            // Guard the divisions below: an empty table yields zero pages / zero duration.
            const avgMsPerPage = ocrStats.total_pages > 0 ? ocrStats.total_duration_ms / ocrStats.total_pages : 0;
            const embeddingStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total_embeddings,
                        COALESCE(AVG(generation_duration_ms), 0) as avg_duration_ms,
                        COALESCE(MIN(generation_duration_ms), 0) as min_duration_ms,
                        COALESCE(MAX(generation_duration_ms), 0) as max_duration_ms,
                        COALESCE(SUM(generation_duration_ms), 0) as total_duration_ms,
                        COUNT(DISTINCT gpu_device) as device_count
                    FROM embeddings
                `)
                .get();
            const vlmStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total_images,
                        COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN 1 ELSE 0 END), 0) as completed,
                        COALESCE(SUM(CASE WHEN vlm_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
                        COALESCE(AVG(CASE WHEN vlm_status = 'complete' THEN vlm_tokens_used END), 0) as avg_tokens,
                        COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN vlm_tokens_used ELSE 0 END), 0) as total_tokens,
                        COALESCE(AVG(CASE WHEN vlm_status = 'complete' THEN vlm_confidence END), 0) as avg_confidence
                    FROM images
                `)
                .get();
            const compStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total,
                        COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
                        COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
                    FROM comparisons
                `)
                .get();
            const clusterStats = conn
                .prepare(`
                    SELECT
                        COALESCE(COUNT(*), 0) as total_clusters,
                        COUNT(DISTINCT run_id) as total_runs,
                        COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
                        COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
                    FROM clusters
                `)
                .get();
            // Durations are in milliseconds: scale to per-minute / per-second rates.
            const pagesPerMinute = ocrStats.total_duration_ms > 0
                ? (ocrStats.total_pages / ocrStats.total_duration_ms) * 60000
                : 0;
            const embeddingsPerSecond = embeddingStats.total_duration_ms > 0
                ? (embeddingStats.total_embeddings / embeddingStats.total_duration_ms) * 1000
                : 0;
            const pipelineData = {
                ocr: {
                    total_docs: ocrStats.total_docs,
                    total_pages: ocrStats.total_pages,
                    avg_duration_ms: ocrStats.avg_duration_ms,
                    min_duration_ms: ocrStats.min_duration_ms,
                    max_duration_ms: ocrStats.max_duration_ms,
                    total_duration_ms: ocrStats.total_duration_ms,
                    avg_ms_per_page: avgMsPerPage,
                    avg_quality: ocrStats.avg_quality,
                },
                embeddings: {
                    total_embeddings: embeddingStats.total_embeddings,
                    avg_duration_ms: embeddingStats.avg_duration_ms,
                    min_duration_ms: embeddingStats.min_duration_ms,
                    max_duration_ms: embeddingStats.max_duration_ms,
                    total_duration_ms: embeddingStats.total_duration_ms,
                    device_count: embeddingStats.device_count,
                },
                vlm: {
                    total_images: vlmStats.total_images,
                    completed: vlmStats.completed,
                    failed: vlmStats.failed,
                    avg_tokens: vlmStats.avg_tokens,
                    total_tokens: vlmStats.total_tokens,
                    avg_confidence: vlmStats.avg_confidence,
                },
                comparisons: {
                    total: compStats.total,
                    avg_duration_ms: compStats.avg_duration_ms,
                    total_duration_ms: compStats.total_duration_ms,
                },
                clustering: {
                    total_clusters: clusterStats.total_clusters,
                    total_runs: clusterStats.total_runs,
                    avg_duration_ms: clusterStats.avg_duration_ms,
                    total_duration_ms: clusterStats.total_duration_ms,
                },
                throughput: {
                    pages_per_minute: pagesPerMinute,
                    embeddings_per_second: embeddingsPerSecond,
                },
            };
            // Group-by breakdown (any other group_by value adds no breakdown)
            if (input.group_by === 'mode') {
                pipelineData.by_mode = conn
                    .prepare(`
                        SELECT
                            datalab_mode as mode,
                            COUNT(*) as count,
                            COALESCE(AVG(processing_duration_ms), 0) as avg_ms,
                            COALESCE(AVG(parse_quality_score), 0) as avg_quality,
                            COALESCE(AVG(cost_cents), 0) as avg_cost
                        FROM ocr_results
                        GROUP BY datalab_mode
                    `)
                    .all();
            }
            else if (input.group_by === 'file_type') {
                // ORDER BY makes the LIMIT deterministic: without it the engine may
                // return an arbitrary subset of file types when there are more than `limit`.
                pipelineData.by_file_type = conn
                    .prepare(`
                        SELECT
                            d.file_type,
                            COUNT(*) as count,
                            COALESCE(AVG(o.processing_duration_ms), 0) as avg_ms,
                            COALESCE(AVG(o.parse_quality_score), 0) as avg_quality
                        FROM ocr_results o
                        JOIN documents d ON d.id = o.document_id
                        GROUP BY d.file_type
                        ORDER BY count DESC, d.file_type ASC
                        LIMIT ?
                    `)
                    .all(input.limit);
            }
            else if (input.group_by === 'document') {
                // Slowest documents first, capped at `limit`.
                pipelineData.by_document = conn
                    .prepare(`
                        SELECT
                            d.id as document_id,
                            d.file_name,
                            o.processing_duration_ms,
                            o.page_count,
                            o.parse_quality_score as quality,
                            o.datalab_mode as mode,
                            (SELECT COUNT(*) FROM chunks c WHERE c.document_id = d.id) as chunk_count,
                            (SELECT COUNT(*) FROM images i WHERE i.document_id = d.id) as image_count
                        FROM ocr_results o
                        JOIN documents d ON d.id = o.document_id
                        ORDER BY o.processing_duration_ms DESC
                        LIMIT ?
                    `)
                    .all(input.limit);
            }
            result.pipeline = pipelineData;
        }
        // ---- Throughput section (former ocr_throughput_analytics from timeline.ts) ----
        if (wants('throughput')) {
            const bucket = input.bucket ?? 'daily';
            const data = db.getThroughputAnalytics({
                bucket,
                created_after: input.created_after,
                created_before: input.created_before,
            });
            // Sum one numeric field across all returned periods.
            const sumOf = (field) => data.reduce((sum, period) => sum + period[field], 0);
            const totalPages = sumOf('pages_processed');
            const totalEmbeddings = sumOf('embeddings_generated');
            const totalImages = sumOf('images_processed');
            const totalOcrMs = sumOf('total_ocr_duration_ms');
            const totalEmbMs = sumOf('total_embedding_duration_ms');
            result.throughput = {
                bucket,
                total_periods: data.length,
                filters: {
                    created_after: input.created_after ?? null,
                    created_before: input.created_before ?? null,
                },
                summary: {
                    total_pages_processed: totalPages,
                    total_embeddings_generated: totalEmbeddings,
                    total_images_processed: totalImages,
                    total_ocr_duration_ms: totalOcrMs,
                    total_embedding_duration_ms: totalEmbMs,
                    // Averages are rounded to two decimals.
                    overall_avg_ms_per_page: totalPages > 0
                        ? Math.round((totalOcrMs / totalPages) * 100) / 100
                        : 0,
                    overall_avg_ms_per_embedding: totalEmbeddings > 0
                        ? Math.round((totalEmbMs / totalEmbeddings) * 100) / 100
                        : 0,
                },
                data,
            };
        }
        // ---- Bottlenecks section (former ocr_provenance_bottlenecks) ----
        if (wants('bottlenecks')) {
            // Only provenance rows with a positive recorded duration are considered.
            const byProcessor = conn
                .prepare(`
                    SELECT
                        processor,
                        type,
                        COUNT(*) as count,
                        COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
                        COALESCE(MIN(processing_duration_ms), 0) as min_duration_ms,
                        COALESCE(MAX(processing_duration_ms), 0) as max_duration_ms,
                        COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
                    FROM provenance
                    WHERE processing_duration_ms IS NOT NULL AND processing_duration_ms > 0
                    GROUP BY processor, type
                    ORDER BY total_duration_ms DESC
                `)
                .all();
            const byChainDepth = conn
                .prepare(`
                    SELECT
                        chain_depth,
                        type,
                        COUNT(*) as count,
                        COALESCE(AVG(processing_duration_ms), 0) as avg_duration_ms,
                        COALESCE(SUM(processing_duration_ms), 0) as total_duration_ms
                    FROM provenance
                    WHERE processing_duration_ms IS NOT NULL AND processing_duration_ms > 0
                    GROUP BY chain_depth, type
                    ORDER BY chain_depth ASC, total_duration_ms DESC
                `)
                .all();
            const slowestOps = conn
                .prepare(`
                    SELECT
                        p.id as provenance_id,
                        p.type,
                        p.processor,
                        p.processing_duration_ms,
                        p.chain_depth,
                        p.source_path,
                        d.file_name as document_name
                    FROM provenance p
                    LEFT JOIN documents d ON d.provenance_id = p.root_document_id
                    WHERE p.processing_duration_ms IS NOT NULL AND p.processing_duration_ms > 0
                    ORDER BY p.processing_duration_ms DESC
                    LIMIT 10
                `)
                .all();
            const grandTotal = byProcessor.reduce((sum, p) => sum + p.total_duration_ms, 0);
            result.bottlenecks = {
                grand_total_duration_ms: grandTotal,
                by_processor: byProcessor.map((p) => ({
                    processor: p.processor,
                    type: p.type,
                    count: p.count,
                    avg_duration_ms: p.avg_duration_ms,
                    min_duration_ms: p.min_duration_ms,
                    max_duration_ms: p.max_duration_ms,
                    total_duration_ms: p.total_duration_ms,
                    // Share of the grand total as a percentage with two decimals.
                    pct_of_total: grandTotal > 0
                        ? Math.round((p.total_duration_ms / grandTotal) * 10000) / 100
                        : 0,
                })),
                by_chain_depth: byChainDepth.map((d) => ({
                    chain_depth: d.chain_depth,
                    type: d.type,
                    count: d.count,
                    avg_duration_ms: d.avg_duration_ms,
                    total_duration_ms: d.total_duration_ms,
                })),
                slowest_operations: slowestOps.map((o) => ({
                    provenance_id: o.provenance_id,
                    type: o.type,
                    processor: o.processor,
                    processing_duration_ms: o.processing_duration_ms,
                    chain_depth: o.chain_depth,
                    document_name: o.document_name,
                    source_path: o.source_path,
                })),
            };
        }
        result.next_steps = [
            { tool: 'ocr_report_overview', description: 'Get quality and corpus overview' },
            { tool: 'ocr_error_analytics', description: 'Analyze error patterns' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1002
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1003
|
+
// ERROR & RECOVERY ANALYTICS HANDLER
|
|
1004
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1005
|
+
/**
 * Handle ocr_error_analytics - Get error and recovery analytics
 *
 * Collects status counts from the documents, images and chunks tables and,
 * when `include_error_messages` is set, the most frequent error messages for
 * documents and failed VLM images (each list capped at `limit`).
 *
 * @param {object} params - Raw tool arguments, validated against ErrorAnalyticsInput.
 * @returns {Promise<object>} The value of formatResponse(successResult(result)), or the
 *   value of handleError(error) when validation or any query throws.
 */
export async function handleErrorAnalytics(params) {
    try {
        const input = validateInput(ErrorAnalyticsInput, params);
        const { db } = requireDatabase();
        const conn = db.getConnection();
        // Percentage of `part` in `whole`; an empty table reports 0 instead of dividing by zero.
        const toPercent = (part, whole) => (whole > 0 ? (part / whole) * 100 : 0);

        // Document status counts
        const documentCounts = conn
            .prepare(`
                SELECT
                    COUNT(*) as total,
                    COALESCE(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
                    COALESCE(SUM(CASE WHEN status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
                    COALESCE(SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END), 0) as pending,
                    COALESCE(SUM(CASE WHEN status = 'processing' THEN 1 ELSE 0 END), 0) as processing
                FROM documents
            `)
            .get();
        const documentFailurePct = toPercent(documentCounts.failed, documentCounts.total);

        // Failure counts and rate per file type, most failures first
        const perFileType = conn
            .prepare(`
                SELECT
                    file_type,
                    COUNT(*) as total,
                    COALESCE(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
                    ROUND(
                        CASE WHEN COUNT(*) > 0
                            THEN CAST(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS REAL) / COUNT(*) * 100
                            ELSE 0
                        END,
                    1) as failure_rate_pct
                FROM documents
                GROUP BY file_type
                ORDER BY failed DESC
            `)
            .all();

        // VLM status counts over all images
        const imageCounts = conn
            .prepare(`
                SELECT
                    COUNT(*) as total_images,
                    COALESCE(SUM(CASE WHEN vlm_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
                    COALESCE(SUM(CASE WHEN vlm_status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
                    COALESCE(SUM(CASE WHEN vlm_status = 'pending' THEN 1 ELSE 0 END), 0) as pending
                FROM images
            `)
            .get();
        const imageFailurePct = toPercent(imageCounts.failed, imageCounts.total_images);

        // Embedding status counts (from chunks embedding_status)
        const chunkCounts = conn
            .prepare(`
                SELECT
                    COUNT(*) as total_chunks,
                    COALESCE(SUM(CASE WHEN embedding_status = 'failed' THEN 1 ELSE 0 END), 0) as failed,
                    COALESCE(SUM(CASE WHEN embedding_status = 'complete' THEN 1 ELSE 0 END), 0) as complete,
                    COALESCE(SUM(CASE WHEN embedding_status = 'pending' THEN 1 ELSE 0 END), 0) as pending
                FROM chunks
            `)
            .get();

        const result = {
            documents: {
                total: documentCounts.total,
                failed: documentCounts.failed,
                complete: documentCounts.complete,
                pending: documentCounts.pending,
                processing: documentCounts.processing,
                failure_rate_pct: documentFailurePct,
            },
            failure_by_file_type: perFileType,
            vlm: {
                total_images: imageCounts.total_images,
                failed: imageCounts.failed,
                complete: imageCounts.complete,
                pending: imageCounts.pending,
                failure_rate_pct: imageFailurePct,
            },
            embeddings: {
                total_chunks: chunkCounts.total_chunks,
                failed: chunkCounts.failed,
                complete: chunkCounts.complete,
                pending: chunkCounts.pending,
            },
        };

        // Optional: most frequent error messages, documents first, then failed VLM images
        if (input.include_error_messages) {
            const maxMessages = input.limit;
            result.common_document_errors = conn
                .prepare(`
                    SELECT
                        error_message,
                        COUNT(*) as count
                    FROM documents
                    WHERE error_message IS NOT NULL
                    GROUP BY error_message
                    ORDER BY count DESC
                    LIMIT ?
                `)
                .all(maxMessages);
            result.common_vlm_errors = conn
                .prepare(`
                    SELECT
                        error_message,
                        COUNT(*) as count
                    FROM images
                    WHERE vlm_status = 'failed' AND error_message IS NOT NULL
                    GROUP BY error_message
                    ORDER BY count DESC
                    LIMIT ?
                `)
                .all(maxMessages);
        }

        result.next_steps = [
            { tool: 'ocr_retry_failed', description: 'Retry failed documents' },
            { tool: 'ocr_image_reset_failed', description: 'Reset failed VLM images' },
            { tool: 'ocr_health_check', description: 'Run a full health check' },
        ];
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1132
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1133
|
+
// UNIFIED TRENDS HANDLER (MERGE-C)
|
|
1134
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1135
|
+
/**
 * Handle ocr_trends - Unified time-series trends
 * metric='quality': OCR quality scores over time (delegates to getQualityTrends)
 * metric='volume': Processing volume counts over time (delegates to getTimelineStats)
 *
 * Any metric other than 'quality' is served by the volume branch.
 *
 * @param {object} params - Raw tool arguments, validated against TrendsInput.
 * @returns {Promise<object>} The value of formatResponse(successResult(...)), or the
 *   value of handleError(error) when validation or the database call throws.
 */
async function handleTrends(params) {
    try {
        const input = validateInput(TrendsInput, params);
        const { db } = requireDatabase();
        const bucket = input.bucket ?? 'daily';
        const { created_after, created_before } = input;
        // Echo the applied date filters back to the caller, with null meaning "not set".
        const echoFilters = () => ({
            created_after: created_after ?? null,
            created_before: created_before ?? null,
        });
        switch (input.metric) {
            case 'quality': {
                const groupBy = input.group_by ?? 'none';
                const series = db.getQualityTrends({
                    bucket,
                    group_by: groupBy,
                    created_after,
                    created_before,
                });
                const payload = {
                    metric: 'quality',
                    bucket,
                    group_by: groupBy,
                    total_periods: series.length,
                    filters: echoFilters(),
                    data: series,
                    next_steps: [
                        { tool: 'ocr_report_overview', description: 'Get aggregate quality summary' },
                        { tool: 'ocr_trends', description: 'View volume trends (metric=volume)' },
                    ],
                };
                return formatResponse(successResult(payload));
            }
            default: {
                // metric === 'volume'
                const volumeMetric = input.volume_metric ?? 'documents';
                const series = db.getTimelineStats({
                    bucket,
                    metric: volumeMetric,
                    created_after,
                    created_before,
                });
                let totalCount = 0;
                for (const period of series) {
                    totalCount = totalCount + period.count;
                }
                const payload = {
                    metric: 'volume',
                    bucket,
                    volume_metric: volumeMetric,
                    total_periods: series.length,
                    total_count: totalCount,
                    filters: echoFilters(),
                    data: series,
                    next_steps: [
                        { tool: 'ocr_report_performance', description: 'Get detailed pipeline performance' },
                        { tool: 'ocr_trends', description: 'View quality trends (metric=quality)' },
                    ],
                };
                return formatResponse(successResult(payload));
            }
        }
    }
    catch (error) {
        return handleError(error);
    }
}
|
|
1198
|
+
/**
 * Render the VLM evaluation report as a Markdown document.
 *
 * Sections: executive summary, image type distribution, per-document summary
 * (top 20 documents by image count), low-confidence images (first 30, only
 * when any exist), and processing statistics with a 40-character progress bar.
 *
 * @param {object} params
 * @param {object} params.dbStats - Reads total_documents, total_chunks, total_embeddings,
 *   total_extractions, total_form_fills.
 * @param {object} params.imageStats - Reads total, processed, pending, failed.
 * @param {Array<object>} params.docStats - Per-document rows; reads file_name, page_count,
 *   image_count, vlm_complete, avg_confidence, min_confidence.
 * @param {Array<object>} params.lowConfidenceImages - Reads file_name, page, confidence,
 *   image_type, path.
 * @param {Object<string, number>} params.imageTypeDistribution - Image count per type.
 * @param {number|null} params.overallAvgConfidence - Fraction in the 0..1 range.
 * @param {number} params.confidenceThreshold - Fraction in the 0..1 range.
 * @param {object} params.comparisonStats - Reads total.
 * @param {object} params.clusteringStats - Reads total_clusters, total_runs, avg_coherence.
 * @returns {string} The Markdown report text.
 */
function generateMarkdownReport(params) {
    const now = new Date().toISOString();
    const { dbStats, imageStats, docStats, lowConfidenceImages, imageTypeDistribution, overallAvgConfidence, confidenceThreshold, comparisonStats, clusteringStats, } = params;
    const BAR_WIDTH = 40;
    const MAX_DOC_ROWS = 20;
    const MAX_LOW_CONF_ROWS = 30;
    // A missing confidence (e.g. a document with no completed VLM images) must not be
    // printed as "0.0%" or "NaN%": that would read as a real, very poor score.
    const formatPct = (fraction, digits = 1) => fraction == null || Number.isNaN(Number(fraction))
        ? 'N/A'
        : `${(fraction * 100).toFixed(digits)}%`;
    // A raw "|" or newline inside a cell would split or terminate the Markdown table row.
    const escapeCell = (value) => String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');
    // Shorten long names to `max` characters, ending in "...".
    const truncate = (text, max) => {
        const str = String(text ?? '');
        return str.length > max ? str.slice(0, max - 3) + '...' : str;
    };
    let report = `# Gemini VLM Evaluation Report

Generated: ${now}

## Executive Summary

| Metric | Value |
|--------|-------|
| Total Documents | ${dbStats.total_documents} |
| Total Pages | ${docStats.reduce((sum, d) => sum + (d.page_count || 0), 0)} |
| Total Images Extracted | ${imageStats.total} |
| VLM Processed | ${imageStats.processed} |
| VLM Pending | ${imageStats.pending} |
| VLM Failed | ${imageStats.failed} |
| **Overall Avg Confidence** | **${formatPct(overallAvgConfidence)}** |
| Low Confidence (< ${(confidenceThreshold * 100).toFixed(0)}%) | ${lowConfidenceImages.length} |

---

## Image Type Distribution

| Type | Count | Percentage |
|------|-------|------------|
`;
    const totalImages = Object.values(imageTypeDistribution).reduce((a, b) => a + b, 0);
    // Most common image types first.
    const sortedTypes = Object.entries(imageTypeDistribution).sort(([, a], [, b]) => b - a);
    for (const [type, count] of sortedTypes) {
        const pct = totalImages > 0 ? ((count / totalImages) * 100).toFixed(1) : '0.0';
        report += `| ${escapeCell(type)} | ${count} | ${pct}% |\n`;
    }
    report += `
---

## Per-Document Summary

| Document | Pages | Images | Complete | Avg Conf | Min Conf |
|----------|-------|--------|----------|----------|----------|
`;
    // Sort by number of images descending (on a copy, so the caller's array is untouched)
    const sortedDocs = [...docStats].sort((a, b) => b.image_count - a.image_count);
    for (const doc of sortedDocs.slice(0, MAX_DOC_ROWS)) {
        const fileName = escapeCell(truncate(doc.file_name, 40));
        report += `| ${fileName} | ${doc.page_count || 'N/A'} | ${doc.image_count} | ${doc.vlm_complete} | ${formatPct(doc.avg_confidence)} | ${formatPct(doc.min_confidence)} |\n`;
    }
    if (sortedDocs.length > MAX_DOC_ROWS) {
        report += `| ... and ${sortedDocs.length - MAX_DOC_ROWS} more | | | | | |\n`;
    }
    if (lowConfidenceImages.length > 0) {
        report += `
---

## Low Confidence Images (< ${(confidenceThreshold * 100).toFixed(0)}%)

These images may need manual review or reprocessing.

| Document | Page | Confidence | Type | Path |
|----------|------|------------|------|------|
`;
        for (const img of lowConfidenceImages.slice(0, MAX_LOW_CONF_ROWS)) {
            const fileName = escapeCell(truncate(img.file_name, 30));
            // Keep only the last two path segments; accept both "/" and "\" separators.
            const shortPath = String(img.path ?? '').split(/[\\/]/).slice(-2).join('/');
            report += `| ${fileName} | ${img.page} | ${formatPct(img.confidence)} | ${escapeCell(img.image_type)} | ${escapeCell(shortPath)} |\n`;
        }
        if (lowConfidenceImages.length > MAX_LOW_CONF_ROWS) {
            report += `| ... and ${lowConfidenceImages.length - MAX_LOW_CONF_ROWS} more | | | | |\n`;
        }
    }
    // `!= null` also covers undefined, which would otherwise print "NaN%".
    const coherenceNote = clusteringStats.avg_coherence != null
        ? ` (avg coherence: ${(clusteringStats.avg_coherence * 100).toFixed(1)}%)`
        : '';
    let progressLine = 'No images to process.';
    if (imageStats.total > 0) {
        const ratio = imageStats.processed / imageStats.total;
        // Clamp the filled part: String.prototype.repeat throws RangeError on a negative
        // count, which happened for the empty part whenever processed exceeded total.
        const filled = Math.min(BAR_WIDTH, Math.max(0, Math.round(ratio * BAR_WIDTH)));
        progressLine = `Processed: ${'█'.repeat(filled)}${'░'.repeat(BAR_WIDTH - filled)} ${(ratio * 100).toFixed(1)}%`;
    }
    report += `
---

## Processing Statistics

- **OCR Results**: ${dbStats.total_documents} documents processed
- **Text Chunks**: ${dbStats.total_chunks} chunks created
- **Text Embeddings**: ${dbStats.total_embeddings} embeddings stored
- **Structured Extractions**: ${dbStats.total_extractions} extractions
- **Form Fills**: ${dbStats.total_form_fills} form fills
- **Comparisons**: ${comparisonStats.total} document comparisons
- **Clusters**: ${clusteringStats.total_clusters} clusters across ${clusteringStats.total_runs} runs${coherenceNote}

### VLM Processing Rate

\`\`\`
${progressLine}
\`\`\`

---

*Report generated by OCR Provenance MCP System*
`;
    return report;
}
|
|
1294
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1295
|
+
// TOOL DEFINITIONS FOR MCP REGISTRATION
|
|
1296
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
1297
|
+
/**
 * Report tools collection for MCP server registration
 *
 * Each entry maps a tool name to its description, its zod input schema shape,
 * and the handler function that serves it.
 */
export const reportTools = {
    ocr_evaluation_report: {
        description: '[STATUS] Use to generate a comprehensive evaluation report with OCR and VLM metrics. Saves as markdown file. Returns report path and summary.',
        inputSchema: {
            output_path: z.string().optional().describe('Path to save markdown report (optional)'),
            confidence_threshold: z.number().min(0).max(1).default(0.7).describe('Threshold for low confidence flagging'),
        },
        handler: handleEvaluationReport,
    },
    ocr_document_report: {
        description: '[STATUS] Use to get a detailed report for a single document (images, extractions, comparisons, clusters). Returns comprehensive document analysis.',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID'),
        },
        handler: handleDocumentReport,
    },
    ocr_report_overview: {
        description: '[STATUS] Quality and corpus overview. section="quality"|"corpus"|"all" (default). Aggregate scores, content type stats.',
        inputSchema: {
            section: z.enum(['quality', 'corpus', 'all']).default('all').describe('Which section to return: quality, corpus, or all'),
            include_section_frequency: z.boolean().default(true).describe('(corpus) Include most common section headings across documents'),
            include_content_type_distribution: z.boolean().default(true).describe('(corpus) Include content type distribution (tables, code, etc.)'),
            limit: z.number().int().min(1).max(100).default(20).describe('(corpus) Max items per list'),
        },
        handler: handleReportOverview,
    },
    ocr_cost_summary: {
        description: '[STATUS] Use to get cost analytics for OCR and form fill operations. Returns costs grouped by document, mode, month, or total.',
        inputSchema: {
            group_by: z.enum(['document', 'mode', 'month', 'total']).default('total').describe('How to group cost data'),
        },
        handler: handleCostSummary,
    },
    ocr_report_performance: {
        description: '[STATUS] Pipeline performance analytics. section="pipeline"|"throughput"|"bottlenecks"|"all" (default).',
        inputSchema: {
            section: z.enum(['pipeline', 'throughput', 'bottlenecks', 'all']).default('all').describe('Which section to return'),
            group_by: z.enum(['total', 'document', 'mode', 'file_type']).default('total').describe('(pipeline) How to group performance data'),
            limit: z.number().int().min(1).max(100).default(20).describe('(pipeline) Max items per group'),
            bucket: z.enum(['hourly', 'daily', 'weekly', 'monthly']).default('daily').describe('(throughput) Time bucket granularity'),
            created_after: z.string().optional().describe('(throughput) Filter data created after this ISO 8601 timestamp'),
            created_before: z.string().optional().describe('(throughput) Filter data created before this ISO 8601 timestamp'),
        },
        handler: handleReportPerformance,
    },
    ocr_error_analytics: {
        description: '[STATUS] Use to get error and recovery analytics (failure rates, common error messages). Returns error breakdown for documents, VLM, and embeddings.',
        inputSchema: {
            include_error_messages: z.boolean().default(true).describe('Include most common error messages'),
            // Previously the only field without a description; the handler uses it as the
            // LIMIT for both the document and the VLM error-message lists.
            limit: z.number().int().min(1).max(50).default(10).describe('Max common error messages to return per list'),
        },
        handler: handleErrorAnalytics,
    },
    ocr_trends: {
        description: '[STATUS] Time-series trends. metric="quality" for OCR scores, "volume" for processing counts. Bucketed by time period.',
        inputSchema: TrendsInput.shape,
        handler: handleTrends,
    },
};
|
|
1394
|
+
//# sourceMappingURL=reports.js.map
|