ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heading Level Normalizer for Section-Aware Chunking
|
|
3
|
+
*
|
|
4
|
+
* Fixes inconsistent heading levels from Datalab OCR by detecting
|
|
5
|
+
* repeating heading patterns (e.g., "ARTICLE N") and normalizing
|
|
6
|
+
* their heading levels to the mode (most common) level within each group.
|
|
7
|
+
*
|
|
8
|
+
* @module services/chunking/heading-normalizer
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Patterns that identify structural heading groups in legal/organizational documents.
|
|
12
|
+
* Each regex matches the heading text (not the markdown # prefix).
|
|
13
|
+
* Bold-wrapped text (e.g., **ARTICLE 1**) is stripped before matching.
|
|
14
|
+
*/
|
|
15
|
+
const HEADING_PATTERNS = [
|
|
16
|
+
{ name: 'ARTICLE', regex: /^ARTICLE\s+\d+/i },
|
|
17
|
+
{ name: 'SECTION', regex: /^SECTION\s+\d+(\.\d+)*/i },
|
|
18
|
+
{ name: 'CHAPTER', regex: /^CHAPTER\s+\d+/i },
|
|
19
|
+
{ name: 'PART', regex: /^PART\s+\d+/i },
|
|
20
|
+
{ name: 'TITLE', regex: /^TITLE\s+\d+/i },
|
|
21
|
+
{ name: 'APPENDIX', regex: /^APPENDIX\s+[A-Z0-9]/i },
|
|
22
|
+
{ name: 'SCHEDULE', regex: /^SCHEDULE\s+[A-Z0-9]/i },
|
|
23
|
+
{ name: 'EXHIBIT', regex: /^EXHIBIT\s+[A-Z0-9]/i },
|
|
24
|
+
];
|
|
25
|
+
/**
|
|
26
|
+
* Strip bold markers (**text**) from heading text for pattern matching.
|
|
27
|
+
*/
|
|
28
|
+
function stripBold(text) {
|
|
29
|
+
return text.replace(/^\*\*(.+)\*\*$/, '$1').trim();
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Compute the mode (most frequent value) of a number array.
|
|
33
|
+
* Ties are broken by preferring the smaller value.
|
|
34
|
+
*/
|
|
35
|
+
function computeMode(values) {
|
|
36
|
+
const counts = new Map();
|
|
37
|
+
for (const v of values) {
|
|
38
|
+
counts.set(v, (counts.get(v) ?? 0) + 1);
|
|
39
|
+
}
|
|
40
|
+
let modeValue = values[0];
|
|
41
|
+
let modeCount = 0;
|
|
42
|
+
for (const [val, count] of counts) {
|
|
43
|
+
if (count > modeCount || (count === modeCount && val < modeValue)) {
|
|
44
|
+
modeValue = val;
|
|
45
|
+
modeCount = count;
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
return modeValue;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Normalize heading levels in-place for consistent section hierarchy.
|
|
52
|
+
*
|
|
53
|
+
* Groups headings by structural patterns (ARTICLE N, Section N.N, etc.),
|
|
54
|
+
* then normalizes each group to use the mode heading level. This fixes
|
|
55
|
+
* Datalab OCR inconsistencies where identical structural headings get
|
|
56
|
+
* assigned different levels (e.g., ARTICLE 1 as H1 but ARTICLE 5 as H3).
|
|
57
|
+
*
|
|
58
|
+
* Only mutates `block.headingLevel` - never modifies `block.text`.
|
|
59
|
+
*
|
|
60
|
+
* @param blocks - Parsed markdown blocks (mutated in-place)
|
|
61
|
+
* @param config - Normalization configuration
|
|
62
|
+
* @returns The same blocks array (for chaining convenience)
|
|
63
|
+
*/
|
|
64
|
+
export function normalizeHeadingLevels(blocks, config) {
|
|
65
|
+
if (!config.enabled) {
|
|
66
|
+
return blocks;
|
|
67
|
+
}
|
|
68
|
+
const minCount = config.minPatternCount ?? 3;
|
|
69
|
+
// Build pattern groups
|
|
70
|
+
const groups = HEADING_PATTERNS.map(p => ({
|
|
71
|
+
name: p.name,
|
|
72
|
+
blockIndices: [],
|
|
73
|
+
levels: [],
|
|
74
|
+
}));
|
|
75
|
+
for (let i = 0; i < blocks.length; i++) {
|
|
76
|
+
const block = blocks[i];
|
|
77
|
+
if (block.type !== 'heading' || block.headingLevel === null || block.headingText === null) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
const cleanText = stripBold(block.headingText);
|
|
81
|
+
for (let g = 0; g < HEADING_PATTERNS.length; g++) {
|
|
82
|
+
if (HEADING_PATTERNS[g].regex.test(cleanText)) {
|
|
83
|
+
groups[g].blockIndices.push(i);
|
|
84
|
+
groups[g].levels.push(block.headingLevel);
|
|
85
|
+
break; // A heading belongs to at most one pattern group
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
// Normalize groups that meet the minimum count threshold
|
|
90
|
+
for (const group of groups) {
|
|
91
|
+
if (group.blockIndices.length < minCount) {
|
|
92
|
+
continue;
|
|
93
|
+
}
|
|
94
|
+
const targetLevel = computeMode(group.levels);
|
|
95
|
+
for (const blockIdx of group.blockIndices) {
|
|
96
|
+
blocks[blockIdx].headingLevel = targetLevel;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return blocks;
|
|
100
|
+
}
|
|
101
|
+
//# sourceMappingURL=heading-normalizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"heading-normalizer.js","sourceRoot":"","sources":["../../../src/services/chunking/heading-normalizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAsBH;;;;GAIG;AACH,MAAM,gBAAgB,GAA2C;IAC/D,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE;IAC7C,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,yBAAyB,EAAE;IACrD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE;IAC7C,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,cAAc,EAAE;IACvC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,eAAe,EAAE;IACzC,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,uBAAuB,EAAE;IACpD,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,uBAAuB,EAAE;IACpD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,sBAAsB,EAAE;CACnD,CAAC;AAEF;;GAEG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;AACrD,CAAC;AAED;;;GAGG;AACH,SAAS,WAAW,CAAC,MAAgB;IACnC,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,IAAI,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IAC1B,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;QAClC,IAAI,KAAK,GAAG,SAAS,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,GAAG,GAAG,SAAS,CAAC,EAAE,CAAC;YAClE,SAAS,GAAG,GAAG,CAAC;YAChB,SAAS,GAAG,KAAK,CAAC;QACpB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,sBAAsB,CACpC,MAAuB,EACvB,MAAkC;IAElC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,eAAe,IAAI,CAAC,CAAC;IAE7C,uBAAuB;IACvB,MAAM,MAAM,GAAmB,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACxD,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,YAAY,EAAE,EAAE;QAChB,MAAM,EAAE,EAAE;KACX,CAAC,CAAC,CAAC;IAEJ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACxB,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,IAAI,KAAK,CAAC,YAAY,KAAK,IAAI,IAAI,KAAK,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;YAC1F,SAAS;QACX,CAAC;QAED,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;QAE/C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,IAAI,gBAAgB,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC9C,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;gBAC1C,MAAM,CAAC,iDAAiD;YAC1D,CAAC;QACH,CAAC;IACH,CAAC;IAED,yDAAyD;IACzD,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YACzC,SAAS;QACX,CAAC;QAED,MAAM,WAAW,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE9C,KAAK,MAAM,QAAQ,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;YAC1C,MAAM,CAAC,QAAQ,CAAC,CAAC,YAAY,GAAG,WAAW,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON Block Analyzer for Section-Aware Chunking
|
|
3
|
+
*
|
|
4
|
+
* Analyzes Datalab JSON block hierarchy to identify atomic (unsplittable)
|
|
5
|
+
* regions such as tables, figures, and code blocks. These regions inform
|
|
6
|
+
* the hybrid chunker where it must NOT split text.
|
|
7
|
+
*
|
|
8
|
+
* @module services/chunking/json-block-analyzer
|
|
9
|
+
*/
|
|
10
|
+
import { PageOffset } from '../../models/document.js';
|
|
11
|
+
/** A region in the markdown text that should not be split */
|
|
12
|
+
export interface AtomicRegion {
|
|
13
|
+
startOffset: number;
|
|
14
|
+
endOffset: number;
|
|
15
|
+
blockType: string;
|
|
16
|
+
pageNumber: number | null;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Find atomic (unsplittable) regions in the markdown text by analyzing JSON blocks.
|
|
20
|
+
*
|
|
21
|
+
* Walks the Datalab JSON block tree, locates Table, TableGroup, Figure, FigureGroup,
|
|
22
|
+
* and Code blocks, then finds their approximate positions in the markdown text using
|
|
23
|
+
* fuzzy text matching. Returns sorted, non-overlapping regions.
|
|
24
|
+
*
|
|
25
|
+
* @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
|
|
26
|
+
* @param markdownText - The full markdown text to search within
|
|
27
|
+
* @param pageOffsets - Page offset information for page number assignment
|
|
28
|
+
* @returns Sorted array of AtomicRegion representing unsplittable text spans
|
|
29
|
+
*/
|
|
30
|
+
export declare function findAtomicRegions(jsonBlocks: Record<string, unknown> | null, markdownText: string, pageOffsets: PageOffset[]): AtomicRegion[];
|
|
31
|
+
/**
|
|
32
|
+
* Check if a character offset falls within an atomic region.
|
|
33
|
+
*
|
|
34
|
+
* Uses binary search on the sorted regions array for efficient lookup.
|
|
35
|
+
*
|
|
36
|
+
* @param offset - The character offset to check
|
|
37
|
+
* @param regions - Sorted array of AtomicRegion (from findAtomicRegions)
|
|
38
|
+
* @returns The containing AtomicRegion, or null if offset is not in any region
|
|
39
|
+
*/
|
|
40
|
+
export declare function isOffsetInAtomicRegion(offset: number, regions: AtomicRegion[]): AtomicRegion | null;
|
|
41
|
+
/** Statistics about block types found in the JSON block hierarchy */
|
|
42
|
+
export interface BlockTypeStats {
|
|
43
|
+
total_blocks: number;
|
|
44
|
+
text_blocks: number;
|
|
45
|
+
table_blocks: number;
|
|
46
|
+
figure_blocks: number;
|
|
47
|
+
code_blocks: number;
|
|
48
|
+
list_blocks: number;
|
|
49
|
+
header_blocks: number;
|
|
50
|
+
footer_blocks: number;
|
|
51
|
+
heading_blocks: number;
|
|
52
|
+
page_count: number;
|
|
53
|
+
tables_per_page: number;
|
|
54
|
+
figures_per_page: number;
|
|
55
|
+
text_density: number;
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Walk the JSON block tree and count block types to produce statistics.
|
|
59
|
+
*
|
|
60
|
+
* Recognizes: Text, Table, TableGroup, Figure, FigureGroup, Code,
|
|
61
|
+
* ListItem, List, PageHeader, PageFooter, SectionHeader, Title, Page.
|
|
62
|
+
*
|
|
63
|
+
* @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
|
|
64
|
+
* @returns BlockTypeStats with counts and derived ratios
|
|
65
|
+
*/
|
|
66
|
+
export declare function computeBlockTypeStats(jsonBlocks: Record<string, unknown> | null): BlockTypeStats | null;
|
|
67
|
+
/**
|
|
68
|
+
* Confidence scores for block types, used to compute chunk quality from
|
|
69
|
+
* the block types present in a chunk. Higher values indicate more structured
|
|
70
|
+
* and typically more reliable content.
|
|
71
|
+
*/
|
|
72
|
+
export declare const BLOCK_TYPE_CONFIDENCE: Record<string, number>;
|
|
73
|
+
/**
|
|
74
|
+
* Compute a confidence score for a chunk based on the block types it contains.
|
|
75
|
+
*
|
|
76
|
+
* Returns the average confidence across all content types in the chunk.
|
|
77
|
+
* Unknown block types default to 0.7. An empty content types array also
|
|
78
|
+
* defaults to 0.7.
|
|
79
|
+
*
|
|
80
|
+
* @param contentTypes - Array of block type strings from the chunk
|
|
81
|
+
* @returns Confidence score between 0 and 1
|
|
82
|
+
*/
|
|
83
|
+
export declare function computeBlockConfidence(contentTypes: string[]): number;
|
|
84
|
+
/** Information about headers and footers detected in the JSON block tree */
|
|
85
|
+
export interface HeaderFooterInfo {
|
|
86
|
+
headerTexts: string[];
|
|
87
|
+
footerTexts: string[];
|
|
88
|
+
repeatedHeaders: string[];
|
|
89
|
+
repeatedFooters: string[];
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Detect repeated headers and footers from the JSON block tree.
|
|
93
|
+
*
|
|
94
|
+
* Walks the block tree for each page, collecting PageHeader and PageFooter
|
|
95
|
+
* block texts. A text is considered "repeated" if it appears on >50% of pages
|
|
96
|
+
* with at least 2 occurrences.
|
|
97
|
+
*
|
|
98
|
+
* @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
|
|
99
|
+
* @returns HeaderFooterInfo with all and repeated header/footer texts
|
|
100
|
+
*/
|
|
101
|
+
export declare function detectRepeatedHeadersFooters(jsonBlocks: Record<string, unknown> | null): HeaderFooterInfo;
|
|
102
|
+
/**
|
|
103
|
+
* Check if a chunk's text closely matches any of the repeated header/footer texts.
|
|
104
|
+
* Uses normalized comparison (lowercased, whitespace-collapsed).
|
|
105
|
+
*
|
|
106
|
+
* @param chunkText - The chunk text to check
|
|
107
|
+
* @param repeatedTexts - Array of repeated header/footer texts
|
|
108
|
+
* @returns true if the chunk text matches a repeated header/footer
|
|
109
|
+
*/
|
|
110
|
+
export declare function isRepeatedHeaderFooter(chunkText: string, repeatedTexts: string[]): boolean;
|
|
111
|
+
/** Extracted structure information for a table block */
|
|
112
|
+
export interface TableStructure {
|
|
113
|
+
startOffset: number;
|
|
114
|
+
endOffset: number;
|
|
115
|
+
columnHeaders: string[];
|
|
116
|
+
rowCount: number;
|
|
117
|
+
columnCount: number;
|
|
118
|
+
pageNumber: number | null;
|
|
119
|
+
/** Human-readable summary of table content */
|
|
120
|
+
summary: string;
|
|
121
|
+
/** Values from the first data row (for summary generation) */
|
|
122
|
+
firstRowValues: string[];
|
|
123
|
+
/** Caption text from preceding block (e.g., "Table 1: Budget Summary") */
|
|
124
|
+
caption?: string;
|
|
125
|
+
/** Index of a prior table this continues (cross-page table detection) */
|
|
126
|
+
continuationOf?: number;
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Extract table structures from the JSON block tree.
|
|
130
|
+
*
|
|
131
|
+
* Walks json_blocks for Table/TableGroup blocks, extracts column headers
|
|
132
|
+
* from the first row, and maps to markdown text offsets.
|
|
133
|
+
*
|
|
134
|
+
* @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
|
|
135
|
+
* @param markdownText - The full markdown text to search within
|
|
136
|
+
* @param pageOffsets - Page offset information for page number assignment
|
|
137
|
+
* @returns Array of TableStructure with column headers and position info
|
|
138
|
+
*/
|
|
139
|
+
export declare function extractTableStructures(jsonBlocks: Record<string, unknown> | null, markdownText: string, pageOffsets: PageOffset[]): TableStructure[];
|
|
140
|
+
/**
|
|
141
|
+
* Extract column headers from the first pipe-delimited row of markdown table text.
|
|
142
|
+
* Fallback when JSON block children don't contain TableRow elements.
|
|
143
|
+
*/
|
|
144
|
+
export declare function extractHeadersFromMarkdown(tableMarkdown: string): string[];
|
|
145
|
+
/**
|
|
146
|
+
* Count table dimensions from markdown pipe-delimited text.
|
|
147
|
+
* Counts data rows (excludes header and separator rows).
|
|
148
|
+
*/
|
|
149
|
+
export declare function countTableDimensionsFromMarkdown(tableMarkdown: string): {
|
|
150
|
+
rowCount: number;
|
|
151
|
+
columnCount: number;
|
|
152
|
+
};
|
|
153
|
+
/**
|
|
154
|
+
* Extract values from the first data row (after header and separator) of markdown table.
|
|
155
|
+
*/
|
|
156
|
+
export declare function extractFirstDataRow(tableMarkdown: string): string[];
|
|
157
|
+
/**
|
|
158
|
+
* Generate a human-readable summary of table content.
|
|
159
|
+
* Format: "Table with N rows and columns: col1, col2. Sample: val1, val2"
|
|
160
|
+
* Max 200 chars.
|
|
161
|
+
*/
|
|
162
|
+
export declare function generateTableSummary(columnHeaders: string[], rowCount: number, firstRowValues: string[], caption?: string): string;
|
|
163
|
+
//# sourceMappingURL=json-block-analyzer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-block-analyzer.d.ts","sourceRoot":"","sources":["../../../src/services/chunking/json-block-analyzer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AAEtD,6DAA6D;AAC7D,MAAM,WAAW,YAAY;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAWD;;;;;;;;;;;GAWG;AACH,wBAAgB,iBAAiB,CAC/B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,EAC1C,YAAY,EAAE,MAAM,EACpB,WAAW,EAAE,UAAU,EAAE,GACxB,YAAY,EAAE,CA6BhB;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,GAAG,YAAY,GAAG,IAAI,CAuBnG;AA8cD,qEAAqE;AACrE,MAAM,WAAW,cAAc;IAC7B,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,wBAAgB,qBAAqB,CACnC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,GACzC,cAAc,GAAG,IAAI,CAyFvB;AAMD;;;;GAIG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAYxD,CAAC;AAEF;;;;;;;;;GASG;AACH,wBAAgB,sBAAsB,CAAC,YAAY,EAAE,MAAM,EAAE,GAAG,MAAM,CAIrE;AAMD,4EAA4E;AAC5E,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,EAAE,MAAM,EAAE,CAAC;CAC3B;AAmCD;;;;;;;;;GASG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,GACzC,gBAAgB,CA4DlB;AAED;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CACpC,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,EAAE,GACtB,OAAO,CAmBT;AAMD,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,8CAA8C;IAC9C,OAAO,EAAE,MAAM,CAAC;IAChB,8DAA8D;IAC9D,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,0EAA0E;IAC1E,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yEAAyE;IACzE,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,sBAAsB,CACpC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,EAC1C,YAAY,EAAE,MAAM,EACpB,WAAW,EAAE,UAAU,EAAE,GACxB,cAAc,EAAE,CAgFlB;AAED;;;GAGG;AACH,wBAAgB,0BAA0B,CAAC,aAAa,EAAE,MAAM,GAAG,MAAM,EAAE,CAc1E;AAED;;;GAGG;AACH,wBAAgB,gCAAgC,CAAC,aAAa,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,WAAW,EAAE,MAAM,CAAA;CAAE,CAyBjH;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,aAAa,EAAE,MAAM,GAAG,MAAM,EAAE,CAiBnE;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,CAClC,aAAa,EAAE,MAAM,EAAE,EACvB,QAAQ,EAAE,MAAM,EAChB,cAAc,EAAE,MAAM,EAAE,EACxB,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAuBR"}
|