ocr-provenance-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocr-provenance-mcp might be problematic. Click here for more details.
- package/.env.example +55 -0
- package/LICENSE +78 -0
- package/README.md +1154 -0
- package/dist/bin-http.d.ts +24 -0
- package/dist/bin-http.d.ts.map +1 -0
- package/dist/bin-http.js +275 -0
- package/dist/bin-http.js.map +1 -0
- package/dist/bin-setup.d.ts +11 -0
- package/dist/bin-setup.d.ts.map +1 -0
- package/dist/bin-setup.js +610 -0
- package/dist/bin-setup.js.map +1 -0
- package/dist/bin.d.ts +16 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +16 -0
- package/dist/bin.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/models/chunk.d.ts +136 -0
- package/dist/models/chunk.d.ts.map +1 -0
- package/dist/models/chunk.js +27 -0
- package/dist/models/chunk.js.map +1 -0
- package/dist/models/cluster.d.ts +79 -0
- package/dist/models/cluster.d.ts.map +1 -0
- package/dist/models/cluster.js +10 -0
- package/dist/models/cluster.js.map +1 -0
- package/dist/models/comparison.d.ts +62 -0
- package/dist/models/comparison.d.ts.map +1 -0
- package/dist/models/comparison.js +8 -0
- package/dist/models/comparison.js.map +1 -0
- package/dist/models/document.d.ts +104 -0
- package/dist/models/document.d.ts.map +1 -0
- package/dist/models/document.js +15 -0
- package/dist/models/document.js.map +1 -0
- package/dist/models/embedding.d.ts +87 -0
- package/dist/models/embedding.d.ts.map +1 -0
- package/dist/models/embedding.js +23 -0
- package/dist/models/embedding.js.map +1 -0
- package/dist/models/extraction.d.ts +15 -0
- package/dist/models/extraction.d.ts.map +1 -0
- package/dist/models/extraction.js +2 -0
- package/dist/models/extraction.js.map +1 -0
- package/dist/models/form-fill.d.ts +23 -0
- package/dist/models/form-fill.d.ts.map +1 -0
- package/dist/models/form-fill.js +2 -0
- package/dist/models/form-fill.js.map +1 -0
- package/dist/models/image.d.ts +177 -0
- package/dist/models/image.d.ts.map +1 -0
- package/dist/models/image.js +8 -0
- package/dist/models/image.js.map +1 -0
- package/dist/models/index.d.ts +14 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +22 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/provenance.d.ts +174 -0
- package/dist/models/provenance.d.ts.map +1 -0
- package/dist/models/provenance.js +53 -0
- package/dist/models/provenance.js.map +1 -0
- package/dist/models/uploaded-file.d.ts +20 -0
- package/dist/models/uploaded-file.d.ts.map +1 -0
- package/dist/models/uploaded-file.js +2 -0
- package/dist/models/uploaded-file.js.map +1 -0
- package/dist/server/errors.d.ts +93 -0
- package/dist/server/errors.d.ts.map +1 -0
- package/dist/server/errors.js +256 -0
- package/dist/server/errors.js.map +1 -0
- package/dist/server/events.d.ts +36 -0
- package/dist/server/events.d.ts.map +1 -0
- package/dist/server/events.js +48 -0
- package/dist/server/events.js.map +1 -0
- package/dist/server/permissions.d.ts +26 -0
- package/dist/server/permissions.d.ts.map +1 -0
- package/dist/server/permissions.js +194 -0
- package/dist/server/permissions.js.map +1 -0
- package/dist/server/register-tools.d.ts +25 -0
- package/dist/server/register-tools.d.ts.map +1 -0
- package/dist/server/register-tools.js +102 -0
- package/dist/server/register-tools.js.map +1 -0
- package/dist/server/startup.d.ts +16 -0
- package/dist/server/startup.d.ts.map +1 -0
- package/dist/server/startup.js +37 -0
- package/dist/server/startup.js.map +1 -0
- package/dist/server/state.d.ts +166 -0
- package/dist/server/state.d.ts.map +1 -0
- package/dist/server/state.js +424 -0
- package/dist/server/state.js.map +1 -0
- package/dist/server/transports/http-transport.d.ts +37 -0
- package/dist/server/transports/http-transport.d.ts.map +1 -0
- package/dist/server/transports/http-transport.js +204 -0
- package/dist/server/transports/http-transport.js.map +1 -0
- package/dist/server/transports/index.d.ts +9 -0
- package/dist/server/transports/index.d.ts.map +1 -0
- package/dist/server/transports/index.js +9 -0
- package/dist/server/transports/index.js.map +1 -0
- package/dist/server/transports/session-manager.d.ts +40 -0
- package/dist/server/transports/session-manager.d.ts.map +1 -0
- package/dist/server/transports/session-manager.js +74 -0
- package/dist/server/transports/session-manager.js.map +1 -0
- package/dist/server/types.d.ts +82 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +14 -0
- package/dist/server/types.js.map +1 -0
- package/dist/services/audit.d.ts +26 -0
- package/dist/services/audit.d.ts.map +1 -0
- package/dist/services/audit.js +43 -0
- package/dist/services/audit.js.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
- package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
- package/dist/services/chunking/chunk-deduplicator.js +46 -0
- package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
- package/dist/services/chunking/chunk-merger.d.ts +26 -0
- package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
- package/dist/services/chunking/chunk-merger.js +94 -0
- package/dist/services/chunking/chunk-merger.js.map +1 -0
- package/dist/services/chunking/chunker.d.ts +62 -0
- package/dist/services/chunking/chunker.d.ts.map +1 -0
- package/dist/services/chunking/chunker.js +566 -0
- package/dist/services/chunking/chunker.js.map +1 -0
- package/dist/services/chunking/heading-normalizer.d.ts +33 -0
- package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/heading-normalizer.js +101 -0
- package/dist/services/chunking/heading-normalizer.js.map +1 -0
- package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
- package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
- package/dist/services/chunking/json-block-analyzer.js +1033 -0
- package/dist/services/chunking/json-block-analyzer.js.map +1 -0
- package/dist/services/chunking/markdown-parser.d.ts +75 -0
- package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
- package/dist/services/chunking/markdown-parser.js +428 -0
- package/dist/services/chunking/markdown-parser.js.map +1 -0
- package/dist/services/chunking/text-normalizer.d.ts +20 -0
- package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
- package/dist/services/chunking/text-normalizer.js +36 -0
- package/dist/services/chunking/text-normalizer.js.map +1 -0
- package/dist/services/clm/contract-schemas.d.ts +36 -0
- package/dist/services/clm/contract-schemas.d.ts.map +1 -0
- package/dist/services/clm/contract-schemas.js +92 -0
- package/dist/services/clm/contract-schemas.js.map +1 -0
- package/dist/services/clm/summarization.d.ts +46 -0
- package/dist/services/clm/summarization.d.ts.map +1 -0
- package/dist/services/clm/summarization.js +61 -0
- package/dist/services/clm/summarization.js.map +1 -0
- package/dist/services/clustering/clustering-service.d.ts +58 -0
- package/dist/services/clustering/clustering-service.d.ts.map +1 -0
- package/dist/services/clustering/clustering-service.js +467 -0
- package/dist/services/clustering/clustering-service.js.map +1 -0
- package/dist/services/comparison/diff-service.d.ts +41 -0
- package/dist/services/comparison/diff-service.d.ts.map +1 -0
- package/dist/services/comparison/diff-service.js +120 -0
- package/dist/services/comparison/diff-service.js.map +1 -0
- package/dist/services/embedding/embedder.d.ts +55 -0
- package/dist/services/embedding/embedder.d.ts.map +1 -0
- package/dist/services/embedding/embedder.js +202 -0
- package/dist/services/embedding/embedder.js.map +1 -0
- package/dist/services/embedding/nomic.d.ts +67 -0
- package/dist/services/embedding/nomic.d.ts.map +1 -0
- package/dist/services/embedding/nomic.js +280 -0
- package/dist/services/embedding/nomic.js.map +1 -0
- package/dist/services/gemini/circuit-breaker.d.ts +106 -0
- package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
- package/dist/services/gemini/circuit-breaker.js +237 -0
- package/dist/services/gemini/circuit-breaker.js.map +1 -0
- package/dist/services/gemini/client.d.ts +173 -0
- package/dist/services/gemini/client.d.ts.map +1 -0
- package/dist/services/gemini/client.js +483 -0
- package/dist/services/gemini/client.js.map +1 -0
- package/dist/services/gemini/config.d.ts +116 -0
- package/dist/services/gemini/config.d.ts.map +1 -0
- package/dist/services/gemini/config.js +118 -0
- package/dist/services/gemini/config.js.map +1 -0
- package/dist/services/gemini/index.d.ts +9 -0
- package/dist/services/gemini/index.d.ts.map +1 -0
- package/dist/services/gemini/index.js +13 -0
- package/dist/services/gemini/index.js.map +1 -0
- package/dist/services/gemini/rate-limiter.d.ts +62 -0
- package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
- package/dist/services/gemini/rate-limiter.js +120 -0
- package/dist/services/gemini/rate-limiter.js.map +1 -0
- package/dist/services/images/extractor.d.ts +88 -0
- package/dist/services/images/extractor.d.ts.map +1 -0
- package/dist/services/images/extractor.js +340 -0
- package/dist/services/images/extractor.js.map +1 -0
- package/dist/services/images/optimizer.d.ts +130 -0
- package/dist/services/images/optimizer.d.ts.map +1 -0
- package/dist/services/images/optimizer.js +228 -0
- package/dist/services/images/optimizer.js.map +1 -0
- package/dist/services/ocr/datalab.d.ts +64 -0
- package/dist/services/ocr/datalab.d.ts.map +1 -0
- package/dist/services/ocr/datalab.js +425 -0
- package/dist/services/ocr/datalab.js.map +1 -0
- package/dist/services/ocr/errors.d.ts +38 -0
- package/dist/services/ocr/errors.d.ts.map +1 -0
- package/dist/services/ocr/errors.js +83 -0
- package/dist/services/ocr/errors.js.map +1 -0
- package/dist/services/ocr/file-manager.d.ts +76 -0
- package/dist/services/ocr/file-manager.d.ts.map +1 -0
- package/dist/services/ocr/file-manager.js +238 -0
- package/dist/services/ocr/file-manager.js.map +1 -0
- package/dist/services/ocr/form-fill.d.ts +48 -0
- package/dist/services/ocr/form-fill.d.ts.map +1 -0
- package/dist/services/ocr/form-fill.js +213 -0
- package/dist/services/ocr/form-fill.js.map +1 -0
- package/dist/services/ocr/processor.d.ts +95 -0
- package/dist/services/ocr/processor.d.ts.map +1 -0
- package/dist/services/ocr/processor.js +259 -0
- package/dist/services/ocr/processor.js.map +1 -0
- package/dist/services/provenance/agent-metadata.d.ts +82 -0
- package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
- package/dist/services/provenance/agent-metadata.js +106 -0
- package/dist/services/provenance/agent-metadata.js.map +1 -0
- package/dist/services/provenance/chain-hash.d.ts +57 -0
- package/dist/services/provenance/chain-hash.d.ts.map +1 -0
- package/dist/services/provenance/chain-hash.js +131 -0
- package/dist/services/provenance/chain-hash.js.map +1 -0
- package/dist/services/provenance/exporter.d.ts +202 -0
- package/dist/services/provenance/exporter.d.ts.map +1 -0
- package/dist/services/provenance/exporter.js +457 -0
- package/dist/services/provenance/exporter.js.map +1 -0
- package/dist/services/provenance/index.d.ts +15 -0
- package/dist/services/provenance/index.d.ts.map +1 -0
- package/dist/services/provenance/index.js +17 -0
- package/dist/services/provenance/index.js.map +1 -0
- package/dist/services/provenance/tracker.d.ts +138 -0
- package/dist/services/provenance/tracker.d.ts.map +1 -0
- package/dist/services/provenance/tracker.js +293 -0
- package/dist/services/provenance/tracker.js.map +1 -0
- package/dist/services/provenance/verifier.d.ts +153 -0
- package/dist/services/provenance/verifier.d.ts.map +1 -0
- package/dist/services/provenance/verifier.js +536 -0
- package/dist/services/provenance/verifier.js.map +1 -0
- package/dist/services/python-pool.d.ts +70 -0
- package/dist/services/python-pool.d.ts.map +1 -0
- package/dist/services/python-pool.js +265 -0
- package/dist/services/python-pool.js.map +1 -0
- package/dist/services/search/bm25.d.ts +180 -0
- package/dist/services/search/bm25.d.ts.map +1 -0
- package/dist/services/search/bm25.js +656 -0
- package/dist/services/search/bm25.js.map +1 -0
- package/dist/services/search/fusion.d.ts +103 -0
- package/dist/services/search/fusion.d.ts.map +1 -0
- package/dist/services/search/fusion.js +122 -0
- package/dist/services/search/fusion.js.map +1 -0
- package/dist/services/search/local-reranker.d.ts +30 -0
- package/dist/services/search/local-reranker.d.ts.map +1 -0
- package/dist/services/search/local-reranker.js +123 -0
- package/dist/services/search/local-reranker.js.map +1 -0
- package/dist/services/search/quality.d.ts +11 -0
- package/dist/services/search/quality.d.ts.map +1 -0
- package/dist/services/search/quality.js +17 -0
- package/dist/services/search/quality.js.map +1 -0
- package/dist/services/search/query-classifier.d.ts +34 -0
- package/dist/services/search/query-classifier.d.ts.map +1 -0
- package/dist/services/search/query-classifier.js +114 -0
- package/dist/services/search/query-classifier.js.map +1 -0
- package/dist/services/search/query-expander.d.ts +73 -0
- package/dist/services/search/query-expander.d.ts.map +1 -0
- package/dist/services/search/query-expander.js +281 -0
- package/dist/services/search/query-expander.js.map +1 -0
- package/dist/services/search/reranker.d.ts +44 -0
- package/dist/services/search/reranker.d.ts.map +1 -0
- package/dist/services/search/reranker.js +101 -0
- package/dist/services/search/reranker.js.map +1 -0
- package/dist/services/storage/database/annotation-operations.d.ts +113 -0
- package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/annotation-operations.js +177 -0
- package/dist/services/storage/database/annotation-operations.js.map +1 -0
- package/dist/services/storage/database/approval-operations.d.ts +132 -0
- package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
- package/dist/services/storage/database/approval-operations.js +206 -0
- package/dist/services/storage/database/approval-operations.js.map +1 -0
- package/dist/services/storage/database/chunk-operations.d.ts +132 -0
- package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
- package/dist/services/storage/database/chunk-operations.js +306 -0
- package/dist/services/storage/database/chunk-operations.js.map +1 -0
- package/dist/services/storage/database/cluster-operations.d.ts +97 -0
- package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
- package/dist/services/storage/database/cluster-operations.js +258 -0
- package/dist/services/storage/database/cluster-operations.js.map +1 -0
- package/dist/services/storage/database/comparison-operations.d.ts +41 -0
- package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
- package/dist/services/storage/database/comparison-operations.js +65 -0
- package/dist/services/storage/database/comparison-operations.js.map +1 -0
- package/dist/services/storage/database/converters.d.ts +36 -0
- package/dist/services/storage/database/converters.d.ts.map +1 -0
- package/dist/services/storage/database/converters.js +244 -0
- package/dist/services/storage/database/converters.js.map +1 -0
- package/dist/services/storage/database/document-operations.d.ts +145 -0
- package/dist/services/storage/database/document-operations.d.ts.map +1 -0
- package/dist/services/storage/database/document-operations.js +498 -0
- package/dist/services/storage/database/document-operations.js.map +1 -0
- package/dist/services/storage/database/embedding-operations.d.ts +130 -0
- package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
- package/dist/services/storage/database/embedding-operations.js +315 -0
- package/dist/services/storage/database/embedding-operations.js.map +1 -0
- package/dist/services/storage/database/extraction-operations.d.ts +47 -0
- package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
- package/dist/services/storage/database/extraction-operations.js +85 -0
- package/dist/services/storage/database/extraction-operations.js.map +1 -0
- package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
- package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
- package/dist/services/storage/database/form-fill-operations.js +116 -0
- package/dist/services/storage/database/form-fill-operations.js.map +1 -0
- package/dist/services/storage/database/helpers.d.ts +29 -0
- package/dist/services/storage/database/helpers.d.ts.map +1 -0
- package/dist/services/storage/database/helpers.js +55 -0
- package/dist/services/storage/database/helpers.js.map +1 -0
- package/dist/services/storage/database/image-operations.d.ts +202 -0
- package/dist/services/storage/database/image-operations.d.ts.map +1 -0
- package/dist/services/storage/database/image-operations.js +484 -0
- package/dist/services/storage/database/image-operations.js.map +1 -0
- package/dist/services/storage/database/index.d.ts +13 -0
- package/dist/services/storage/database/index.d.ts.map +1 -0
- package/dist/services/storage/database/index.js +16 -0
- package/dist/services/storage/database/index.js.map +1 -0
- package/dist/services/storage/database/lock-operations.d.ts +59 -0
- package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
- package/dist/services/storage/database/lock-operations.js +89 -0
- package/dist/services/storage/database/lock-operations.js.map +1 -0
- package/dist/services/storage/database/obligation-operations.d.ts +88 -0
- package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
- package/dist/services/storage/database/obligation-operations.js +206 -0
- package/dist/services/storage/database/obligation-operations.js.map +1 -0
- package/dist/services/storage/database/ocr-operations.d.ts +33 -0
- package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
- package/dist/services/storage/database/ocr-operations.js +70 -0
- package/dist/services/storage/database/ocr-operations.js.map +1 -0
- package/dist/services/storage/database/playbook-operations.d.ts +72 -0
- package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
- package/dist/services/storage/database/playbook-operations.js +247 -0
- package/dist/services/storage/database/playbook-operations.js.map +1 -0
- package/dist/services/storage/database/provenance-operations.d.ts +112 -0
- package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
- package/dist/services/storage/database/provenance-operations.js +251 -0
- package/dist/services/storage/database/provenance-operations.js.map +1 -0
- package/dist/services/storage/database/service.d.ts +142 -0
- package/dist/services/storage/database/service.d.ts.map +1 -0
- package/dist/services/storage/database/service.js +310 -0
- package/dist/services/storage/database/service.js.map +1 -0
- package/dist/services/storage/database/static-operations.d.ts +30 -0
- package/dist/services/storage/database/static-operations.d.ts.map +1 -0
- package/dist/services/storage/database/static-operations.js +218 -0
- package/dist/services/storage/database/static-operations.js.map +1 -0
- package/dist/services/storage/database/stats-operations.d.ts +101 -0
- package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
- package/dist/services/storage/database/stats-operations.js +394 -0
- package/dist/services/storage/database/stats-operations.js.map +1 -0
- package/dist/services/storage/database/tag-operations.d.ts +76 -0
- package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
- package/dist/services/storage/database/tag-operations.js +178 -0
- package/dist/services/storage/database/tag-operations.js.map +1 -0
- package/dist/services/storage/database/types.d.ts +286 -0
- package/dist/services/storage/database/types.d.ts.map +1 -0
- package/dist/services/storage/database/types.js +39 -0
- package/dist/services/storage/database/types.js.map +1 -0
- package/dist/services/storage/database/upload-operations.d.ts +71 -0
- package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
- package/dist/services/storage/database/upload-operations.js +124 -0
- package/dist/services/storage/database/upload-operations.js.map +1 -0
- package/dist/services/storage/database/user-operations.d.ts +102 -0
- package/dist/services/storage/database/user-operations.d.ts.map +1 -0
- package/dist/services/storage/database/user-operations.js +151 -0
- package/dist/services/storage/database/user-operations.js.map +1 -0
- package/dist/services/storage/database/workflow-operations.d.ts +98 -0
- package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
- package/dist/services/storage/database/workflow-operations.js +157 -0
- package/dist/services/storage/database/workflow-operations.js.map +1 -0
- package/dist/services/storage/database.d.ts +16 -0
- package/dist/services/storage/database.d.ts.map +1 -0
- package/dist/services/storage/database.js +15 -0
- package/dist/services/storage/database.js.map +1 -0
- package/dist/services/storage/index.d.ts +10 -0
- package/dist/services/storage/index.d.ts.map +1 -0
- package/dist/services/storage/index.js +10 -0
- package/dist/services/storage/index.js.map +1 -0
- package/dist/services/storage/migrations/index.d.ts +16 -0
- package/dist/services/storage/migrations/index.d.ts.map +1 -0
- package/dist/services/storage/migrations/index.js +20 -0
- package/dist/services/storage/migrations/index.js.map +1 -0
- package/dist/services/storage/migrations/operations.d.ts +40 -0
- package/dist/services/storage/migrations/operations.d.ts.map +1 -0
- package/dist/services/storage/migrations/operations.js +2910 -0
- package/dist/services/storage/migrations/operations.js.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
- package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-definitions.js +1006 -0
- package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
- package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
- package/dist/services/storage/migrations/schema-helpers.js +176 -0
- package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
- package/dist/services/storage/migrations/types.d.ts +15 -0
- package/dist/services/storage/migrations/types.d.ts.map +1 -0
- package/dist/services/storage/migrations/types.js +21 -0
- package/dist/services/storage/migrations/types.js.map +1 -0
- package/dist/services/storage/migrations/verification.d.ts +20 -0
- package/dist/services/storage/migrations/verification.d.ts.map +1 -0
- package/dist/services/storage/migrations/verification.js +78 -0
- package/dist/services/storage/migrations/verification.js.map +1 -0
- package/dist/services/storage/migrations.d.ts +16 -0
- package/dist/services/storage/migrations.d.ts.map +1 -0
- package/dist/services/storage/migrations.js +17 -0
- package/dist/services/storage/migrations.js.map +1 -0
- package/dist/services/storage/types.d.ts +12 -0
- package/dist/services/storage/types.d.ts.map +1 -0
- package/dist/services/storage/types.js +5 -0
- package/dist/services/storage/types.js.map +1 -0
- package/dist/services/storage/vector.d.ts +208 -0
- package/dist/services/storage/vector.d.ts.map +1 -0
- package/dist/services/storage/vector.js +526 -0
- package/dist/services/storage/vector.js.map +1 -0
- package/dist/services/vlm/pipeline.d.ts +194 -0
- package/dist/services/vlm/pipeline.d.ts.map +1 -0
- package/dist/services/vlm/pipeline.js +800 -0
- package/dist/services/vlm/pipeline.js.map +1 -0
- package/dist/services/vlm/prompts.d.ts +171 -0
- package/dist/services/vlm/prompts.d.ts.map +1 -0
- package/dist/services/vlm/prompts.js +229 -0
- package/dist/services/vlm/prompts.js.map +1 -0
- package/dist/services/vlm/service.d.ts +174 -0
- package/dist/services/vlm/service.d.ts.map +1 -0
- package/dist/services/vlm/service.js +256 -0
- package/dist/services/vlm/service.js.map +1 -0
- package/dist/services/webhook-delivery.d.ts +4 -0
- package/dist/services/webhook-delivery.d.ts.map +1 -0
- package/dist/services/webhook-delivery.js +140 -0
- package/dist/services/webhook-delivery.js.map +1 -0
- package/dist/tools/chunks.d.ts +19 -0
- package/dist/tools/chunks.d.ts.map +1 -0
- package/dist/tools/chunks.js +392 -0
- package/dist/tools/chunks.js.map +1 -0
- package/dist/tools/clm.d.ts +16 -0
- package/dist/tools/clm.d.ts.map +1 -0
- package/dist/tools/clm.js +668 -0
- package/dist/tools/clm.js.map +1 -0
- package/dist/tools/clustering.d.ts +13 -0
- package/dist/tools/clustering.d.ts.map +1 -0
- package/dist/tools/clustering.js +498 -0
- package/dist/tools/clustering.js.map +1 -0
- package/dist/tools/collaboration.d.ts +15 -0
- package/dist/tools/collaboration.d.ts.map +1 -0
- package/dist/tools/collaboration.js +516 -0
- package/dist/tools/collaboration.js.map +1 -0
- package/dist/tools/comparison.d.ts +13 -0
- package/dist/tools/comparison.d.ts.map +1 -0
- package/dist/tools/comparison.js +735 -0
- package/dist/tools/comparison.js.map +1 -0
- package/dist/tools/compliance.d.ts +15 -0
- package/dist/tools/compliance.d.ts.map +1 -0
- package/dist/tools/compliance.js +640 -0
- package/dist/tools/compliance.js.map +1 -0
- package/dist/tools/config.d.ts +19 -0
- package/dist/tools/config.d.ts.map +1 -0
- package/dist/tools/config.js +213 -0
- package/dist/tools/config.js.map +1 -0
- package/dist/tools/database.d.ts +62 -0
- package/dist/tools/database.d.ts.map +1 -0
- package/dist/tools/database.js +288 -0
- package/dist/tools/database.js.map +1 -0
- package/dist/tools/documents.d.ts +61 -0
- package/dist/tools/documents.d.ts.map +1 -0
- package/dist/tools/documents.js +1624 -0
- package/dist/tools/documents.js.map +1 -0
- package/dist/tools/embeddings.d.ts +14 -0
- package/dist/tools/embeddings.d.ts.map +1 -0
- package/dist/tools/embeddings.js +626 -0
- package/dist/tools/embeddings.js.map +1 -0
- package/dist/tools/evaluation.d.ts +25 -0
- package/dist/tools/evaluation.d.ts.map +1 -0
- package/dist/tools/evaluation.js +523 -0
- package/dist/tools/evaluation.js.map +1 -0
- package/dist/tools/events.d.ts +16 -0
- package/dist/tools/events.d.ts.map +1 -0
- package/dist/tools/events.js +493 -0
- package/dist/tools/events.js.map +1 -0
- package/dist/tools/extraction-structured.d.ts +13 -0
- package/dist/tools/extraction-structured.d.ts.map +1 -0
- package/dist/tools/extraction-structured.js +390 -0
- package/dist/tools/extraction-structured.js.map +1 -0
- package/dist/tools/extraction.d.ts +24 -0
- package/dist/tools/extraction.d.ts.map +1 -0
- package/dist/tools/extraction.js +424 -0
- package/dist/tools/extraction.js.map +1 -0
- package/dist/tools/file-management.d.ts +14 -0
- package/dist/tools/file-management.d.ts.map +1 -0
- package/dist/tools/file-management.js +523 -0
- package/dist/tools/file-management.js.map +1 -0
- package/dist/tools/form-fill.d.ts +13 -0
- package/dist/tools/form-fill.d.ts.map +1 -0
- package/dist/tools/form-fill.js +250 -0
- package/dist/tools/form-fill.js.map +1 -0
- package/dist/tools/health.d.ts +19 -0
- package/dist/tools/health.d.ts.map +1 -0
- package/dist/tools/health.js +229 -0
- package/dist/tools/health.js.map +1 -0
- package/dist/tools/images.d.ts +54 -0
- package/dist/tools/images.d.ts.map +1 -0
- package/dist/tools/images.js +787 -0
- package/dist/tools/images.js.map +1 -0
- package/dist/tools/ingestion.d.ts +94 -0
- package/dist/tools/ingestion.d.ts.map +1 -0
- package/dist/tools/ingestion.js +1659 -0
- package/dist/tools/ingestion.js.map +1 -0
- package/dist/tools/intelligence.d.ts +18 -0
- package/dist/tools/intelligence.d.ts.map +1 -0
- package/dist/tools/intelligence.js +1039 -0
- package/dist/tools/intelligence.js.map +1 -0
- package/dist/tools/provenance.d.ts +51 -0
- package/dist/tools/provenance.d.ts.map +1 -0
- package/dist/tools/provenance.js +691 -0
- package/dist/tools/provenance.js.map +1 -0
- package/dist/tools/reports.d.ts +41 -0
- package/dist/tools/reports.d.ts.map +1 -0
- package/dist/tools/reports.js +1394 -0
- package/dist/tools/reports.js.map +1 -0
- package/dist/tools/search.d.ts +35 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +2528 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/tools/shared.d.ts +52 -0
- package/dist/tools/shared.d.ts.map +1 -0
- package/dist/tools/shared.js +54 -0
- package/dist/tools/shared.js.map +1 -0
- package/dist/tools/tags.d.ts +15 -0
- package/dist/tools/tags.d.ts.map +1 -0
- package/dist/tools/tags.js +287 -0
- package/dist/tools/tags.js.map +1 -0
- package/dist/tools/timeline.d.ts +15 -0
- package/dist/tools/timeline.d.ts.map +1 -0
- package/dist/tools/timeline.js +14 -0
- package/dist/tools/timeline.js.map +1 -0
- package/dist/tools/users.d.ts +14 -0
- package/dist/tools/users.d.ts.map +1 -0
- package/dist/tools/users.js +257 -0
- package/dist/tools/users.js.map +1 -0
- package/dist/tools/vlm.d.ts +40 -0
- package/dist/tools/vlm.d.ts.map +1 -0
- package/dist/tools/vlm.js +475 -0
- package/dist/tools/vlm.js.map +1 -0
- package/dist/tools/workflow.d.ts +16 -0
- package/dist/tools/workflow.d.ts.map +1 -0
- package/dist/tools/workflow.js +495 -0
- package/dist/tools/workflow.js.map +1 -0
- package/dist/utils/backoff.d.ts +53 -0
- package/dist/utils/backoff.d.ts.map +1 -0
- package/dist/utils/backoff.js +78 -0
- package/dist/utils/backoff.js.map +1 -0
- package/dist/utils/config-persistence.d.ts +33 -0
- package/dist/utils/config-persistence.d.ts.map +1 -0
- package/dist/utils/config-persistence.js +61 -0
- package/dist/utils/config-persistence.js.map +1 -0
- package/dist/utils/hash.d.ts +65 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/dist/utils/hash.js +146 -0
- package/dist/utils/hash.js.map +1 -0
- package/dist/utils/math.d.ts +21 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +39 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/validation.d.ts +697 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +529 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +96 -0
- package/python/.gitkeep +0 -0
- package/python/__init__.py +104 -0
- package/python/clustering_worker.py +440 -0
- package/python/docx_image_extractor.py +524 -0
- package/python/embedding_worker.py +552 -0
- package/python/file_manager_worker.py +564 -0
- package/python/form_fill_worker.py +399 -0
- package/python/gpu_utils.py +582 -0
- package/python/image_extractor.py +317 -0
- package/python/image_optimizer.py +444 -0
- package/python/ocr_worker.py +712 -0
- package/python/pyproject.toml +76 -0
- package/python/requirements.txt +51 -0
- package/python/reranker_worker.py +87 -0
|
@@ -0,0 +1,1033 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON Block Analyzer for Section-Aware Chunking
|
|
3
|
+
*
|
|
4
|
+
* Analyzes Datalab JSON block hierarchy to identify atomic (unsplittable)
|
|
5
|
+
* regions such as tables, figures, and code blocks. These regions inform
|
|
6
|
+
* the hybrid chunker where it must NOT split text.
|
|
7
|
+
*
|
|
8
|
+
* @module services/chunking/json-block-analyzer
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Block types treated as atomic: a block of one of these types must be kept
 * whole (never split) by the chunker.
 */
const ATOMIC_BLOCK_TYPES = new Set(['Table', 'TableGroup', 'Figure', 'FigureGroup', 'Code']);
|
|
18
|
+
/**
 * Find atomic (unsplittable) regions in the markdown text by analyzing JSON blocks.
 *
 * Walks the Datalab JSON block tree via walkBlocks, and for every block whose
 * block_type is in ATOMIC_BLOCK_TYPES (Table, TableGroup, Figure, FigureGroup,
 * Code) asks locateBlockInMarkdown for its position in the markdown text.
 * The located regions are sorted by startOffset and then passed through
 * mergeOverlappingRegions.
 *
 * @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
 * @param markdownText - The full markdown text to search within; null,
 *   undefined, or an empty string yields an empty result
 * @param pageOffsets - Page offset information, forwarded unchanged to
 *   locateBlockInMarkdown
 * @returns Sorted array of AtomicRegion representing unsplittable text spans
 */
export function findAtomicRegions(jsonBlocks, markdownText, pageOffsets) {
    // Nothing to analyze without a block tree.
    if (!jsonBlocks) {
        return [];
    }
    // Treat missing text like empty text instead of throwing on `.length`.
    if (markdownText == null || markdownText.length === 0) {
        return [];
    }
    const rawRegions = [];
    // Walk the JSON block tree; the trailing 0 is the initial value handed to
    // walkBlocks (presumably the starting page number -- confirm in walkBlocks).
    walkBlocks(jsonBlocks, (block, pageNum) => {
        const blockType = block.block_type;
        if (!blockType || !ATOMIC_BLOCK_TYPES.has(blockType)) {
            return;
        }
        // A falsy result is skipped (presumably: block not found in the text).
        const region = locateBlockInMarkdown(block, blockType, pageNum, markdownText, pageOffsets);
        if (region) {
            rawRegions.push(region);
        }
    }, 0);
    // Sort by startOffset so the merge step sees regions in text order.
    rawRegions.sort((a, b) => a.startOffset - b.startOffset);
    // Merge overlapping regions
    return mergeOverlappingRegions(rawRegions);
}
|
|
54
|
+
/**
 * Look up the atomic region that contains a character offset.
 *
 * Performs a binary search, so `regions` is expected to be sorted by
 * startOffset and non-overlapping (the shape findAtomicRegions returns).
 * A region contains `offset` when startOffset <= offset < endOffset.
 *
 * @param offset - The character offset to check
 * @param regions - Sorted array of atomic regions
 * @returns The containing region, or null if the offset is in none
 */
export function isOffsetInAtomicRegion(offset, regions) {
    let lo = 0;
    let hi = regions.length - 1;
    while (lo <= hi) {
        const mid = Math.floor((lo + hi) / 2);
        const candidate = regions[mid];
        if (offset < candidate.startOffset) {
            // Target lies before this region: continue in the left half.
            hi = mid - 1;
            continue;
        }
        if (offset >= candidate.endOffset) {
            // endOffset is exclusive: continue in the right half.
            lo = mid + 1;
            continue;
        }
        return candidate;
    }
    return null;
}
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Internal helpers
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
/**
 * Strip HTML tags and decode a small set of HTML entities.
 *
 * Decodes the amp, lt, gt, quot, #39 and nbsp entities. Decoding happens in a
 * single pass so that an escaped entity (an ampersand entity followed by
 * "lt;") yields the literal entity text instead of being decoded twice.
 *
 * @param html - HTML fragment
 * @returns The text with tags removed and the basic entities decoded
 */
function stripHtmlTags(html) {
    const decoded = { amp: '&', lt: '<', gt: '>', quot: '"', '#39': "'", nbsp: ' ' };
    // Remove all tags first, then decode entities in what is left.
    const withoutTags = html.replace(/<[^>]*>/g, '');
    return withoutTags.replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_match, name) => decoded[name]);
}
|
|
103
|
+
/**
 * Depth-first, pre-order traversal of the JSON block tree.
 *
 * The callback receives every block together with the current page number.
 * Children are read from `children`, falling back to `blocks`. Among siblings,
 * the page number is advanced by one after each child whose `block_type` is
 * 'Page' has been walked.
 *
 * @param block - Block to visit (and descend into)
 * @param callback - Invoked as callback(block, pageNum)
 * @param pageNum - Page number to report for this block
 */
function walkBlocks(block, callback, pageNum) {
    callback(block, pageNum);
    const children = block.children ?? block.blocks;
    if (!Array.isArray(children)) {
        return;
    }
    let pageForNextChild = pageNum;
    for (let i = 0; i < children.length; i += 1) {
        const child = children[i];
        const wasPage = child.block_type === 'Page';
        walkBlocks(child, callback, pageForNextChild);
        if (wasPage) {
            // The next sibling belongs to the following page.
            pageForNextChild += 1;
        }
    }
}
|
|
123
|
+
/**
 * Dispatch to the locating strategy that fits the block type.
 *
 * Table and TableGroup blocks go through locateTableInMarkdown; every other
 * type (Figure, FigureGroup, Code) goes through locateByHtmlContent.
 * The page number argument is accepted but not used.
 *
 * @returns The located region, or null when the block could not be found
 */
function locateBlockInMarkdown(block, blockType, _pageNum, markdownText, pageOffsets) {
    const isTableLike = blockType === 'Table' || blockType === 'TableGroup';
    const locate = isTableLike ? locateTableInMarkdown : locateByHtmlContent;
    return locate(block, blockType, markdownText, pageOffsets);
}
|
|
135
|
+
/**
 * Locate a table block in the markdown text.
 *
 * The search key is the first non-blank line of the block's tag-stripped
 * HTML, truncated to 60 characters. The key is matched with findFuzzyMatch
 * and the match is expanded to the surrounding table with findTableExtent.
 *
 * @returns Region { startOffset, endOffset, blockType, pageNumber }, or null
 *          when the key is too short, not found, or no table extent exists
 */
function locateTableInMarkdown(block, blockType, markdownText, pageOffsets) {
    const plainText = stripHtmlTags(block.html ?? '').trim();
    const firstLine = plainText.split('\n').find((line) => line.trim().length > 0);
    const searchKey = firstLine === undefined ? '' : firstLine.trim().slice(0, 60);
    if (searchKey.length < 5) {
        // Too little text to search for; defer to the pattern fallback.
        return locateTableByPattern(blockType, markdownText, pageOffsets);
    }
    const keyIdx = findFuzzyMatch(searchKey, markdownText);
    if (keyIdx === -1) {
        console.error(`[json-block-analyzer] Could not locate ${blockType} block with search key: "${searchKey.slice(0, 40)}..."`);
        return null;
    }
    const extent = findTableExtent(markdownText, keyIdx);
    if (!extent) {
        return null;
    }
    const { start, end } = extent;
    validateRegionOffsets(start, end);
    return {
        startOffset: start,
        endOffset: end,
        blockType,
        pageNumber: getPageNumberForOffset(start, pageOffsets),
    };
}
|
|
176
|
+
/**
 * Locate a non-table block (Figure, FigureGroup, Code) in the markdown text.
 *
 * The first 50 characters of the block's tag-stripped HTML are used as the
 * search key. For Code blocks the region ends where findCodeBlockEnd says;
 * for the other types it spans the stripped text length (at least 20
 * characters), clamped to the end of the markdown text.
 *
 * @returns Region { startOffset, endOffset, blockType, pageNumber }, or null
 *          when there is no usable HTML or the key is not found
 */
function locateByHtmlContent(block, blockType, markdownText, pageOffsets) {
    const html = block.html ?? '';
    if (html.length === 0) {
        return null;
    }
    const plainText = stripHtmlTags(html).trim();
    if (plainText.length === 0) {
        return null;
    }
    const searchKey = plainText.slice(0, 50).trim();
    if (searchKey.length < 3) {
        return null;
    }
    const matchIdx = findFuzzyMatch(searchKey, markdownText);
    if (matchIdx === -1) {
        console.error(`[json-block-analyzer] Could not locate ${blockType} block with content: "${searchKey.slice(0, 40)}..."`);
        return null;
    }
    // Code: run to the closing fence. Figures: rough extent from the text length.
    const endIdx = blockType === 'Code'
        ? findCodeBlockEnd(markdownText, matchIdx)
        : Math.min(matchIdx + Math.max(plainText.length, 20), markdownText.length);
    validateRegionOffsets(matchIdx, endIdx);
    return {
        startOffset: matchIdx,
        endOffset: endIdx,
        blockType,
        pageNumber: getPageNumberForOffset(matchIdx, pageOffsets),
    };
}
|
|
220
|
+
/**
 * Find where a search key occurs in the markdown text.
 *
 * An exact substring match wins. Otherwise both strings are normalized
 * (see normalizeForSearch) and the normalized match position is mapped back
 * to an index in the original text.
 *
 * @returns The start index of the first match, or -1 if not found
 */
function findFuzzyMatch(searchKey, markdownText) {
    const literalHit = markdownText.indexOf(searchKey);
    if (literalHit !== -1) {
        return literalHit;
    }
    const needle = normalizeForSearch(searchKey);
    if (needle.length < 3) {
        // Too short to match loosely without false positives.
        return -1;
    }
    const looseHit = normalizeForSearch(markdownText).indexOf(needle);
    return looseHit === -1 ? -1 : mapNormalizedIndexToOriginal(markdownText, looseHit);
}
|
|
247
|
+
/**
 * Canonical form used for loose matching: lowercase, every whitespace run
 * collapsed to one space, no leading or trailing whitespace.
 */
function normalizeForSearch(text) {
    const lowered = text.toLowerCase();
    const collapsed = lowered.replace(/\s+/g, ' ');
    return collapsed.trim();
}
|
|
253
|
+
/**
 * Translate an index in the whitespace-normalized text back to an index in
 * the original text.
 *
 * Walks the original text while counting the characters the normalized form
 * would contain: leading whitespace counts for nothing, each later whitespace
 * run counts as one character, every other character counts as one.
 *
 * @returns The original index of the normalizedIdx-th normalized character,
 *          or originalText.length when the text is exhausted first
 */
function mapNormalizedIndexToOriginal(originalText, normalizedIdx) {
    let emitted = 0;
    // Starting "inside whitespace" makes leading whitespace vanish like any
    // other collapsed run.
    let previousWasSpace = true;
    for (let i = 0; i < originalText.length; i += 1) {
        const isSpace = /\s/.test(originalText[i]);
        if (isSpace && previousWasSpace) {
            continue;
        }
        if (emitted === normalizedIdx) {
            return i;
        }
        emitted += 1;
        previousWasSpace = isSpace;
    }
    return originalText.length;
}
|
|
290
|
+
/**
 * Find the full extent of the markdown table that contains a given position.
 *
 * A table row is a line whose trimmed text starts with '|'. Starting from the
 * line that contains `nearIdx`, the table is extended upward and downward
 * over contiguous table rows. A blank line directly after the last row is
 * included in the extent; a blank line above the table ends it.
 *
 * @param markdownText - The full markdown text
 * @param nearIdx - Any index inside one of the table's rows
 * @returns { start, end } (end exclusive), or null when the line containing
 *          nearIdx is not a table row
 */
function findTableExtent(markdownText, nearIdx) {
    const textLength = markdownText.length;
    const startOfLineAt = (idx) => (idx > 0 ? markdownText.lastIndexOf('\n', idx - 1) + 1 : 0);
    const endOfLineFrom = (idx) => {
        const newline = markdownText.indexOf('\n', idx);
        return newline === -1 ? textLength : newline;
    };
    const isTableRow = (from, to) => markdownText.slice(from, to).trim().startsWith('|');

    const lineStart = startOfLineAt(nearIdx);
    if (!isTableRow(lineStart, endOfLineFrom(lineStart))) {
        // The match landed outside any table row; there is nothing to expand.
        return null;
    }
    // Extend upward while the previous line is a table row. tableStart - 1 is
    // the newline terminating the previous line, so that line is inspected on
    // its own: a blank line is never stepped over.
    let tableStart = lineStart;
    while (tableStart > 0) {
        const prevLineEnd = tableStart - 1;
        const prevLineStart = startOfLineAt(prevLineEnd);
        if (!isTableRow(prevLineStart, prevLineEnd)) {
            break;
        }
        tableStart = prevLineStart;
    }
    // Extend downward from the start of the matched line (not from nearIdx,
    // which may sit in the middle of a row).
    let tableEnd = lineStart;
    while (tableEnd < textLength) {
        const lineEnd = endOfLineFrom(tableEnd);
        const line = markdownText.slice(tableEnd, lineEnd).trim();
        if (line.startsWith('|')) {
            tableEnd = lineEnd + 1;
            continue;
        }
        if (line.length === 0) {
            // Keep the blank line that terminates the table inside the region.
            tableEnd = lineEnd + 1;
        }
        break;
    }
    tableEnd = Math.min(tableEnd, textLength);
    if (tableEnd <= tableStart) {
        return null;
    }
    return { start: tableStart, end: tableEnd };
}
|
|
352
|
+
/**
 * Find the end offset of the fenced code block around a given position.
 *
 * The nearest ``` at or before `startIdx` is taken as the opening fence (if
 * there is none, `startIdx` itself is used). Scanning then proceeds line by
 * line after the opening fence line until a line whose first non-blank
 * characters are ``` is found.
 *
 * Note: when `startIdx` is not inside a fenced block, the preceding fence
 * that is found may belong to a different block.
 *
 * @param markdownText - The full markdown text
 * @param startIdx - Index at or after the opening fence
 * @returns Index just past the closing fence line (exclusive), or the text
 *          length when no closing fence exists
 */
function findCodeBlockEnd(markdownText, startIdx) {
    const textLength = markdownText.length;
    const precedingFence = markdownText.lastIndexOf('```', startIdx);
    const fenceStart = precedingFence === -1 ? startIdx : precedingFence;
    // Skip the rest of the opening fence line (fence plus optional info string).
    const openingLineEnd = markdownText.indexOf('\n', fenceStart + 3);
    if (openingLineEnd === -1) {
        return textLength;
    }
    let pos = openingLineEnd + 1;
    while (pos < textLength) {
        const newline = markdownText.indexOf('\n', pos);
        const lineEnd = newline === -1 ? textLength : newline;
        // Inspect only the current line: trimming the whole remaining text
        // would let a blank line "see" a fence on the following line.
        if (markdownText.slice(pos, lineEnd).trimStart().startsWith('```')) {
            return Math.min(lineEnd + 1, textLength);
        }
        pos = lineEnd + 1;
    }
    // No closing fence found: the block runs to the end of the text.
    return textLength;
}
|
|
388
|
+
/**
 * Fallback used when a table block has no usable search text.
 *
 * Without content there is no reliable way to tell which table is meant, so
 * this only reports the failure on stderr and gives up. The markdown text and
 * page offsets are accepted but not used.
 *
 * @returns Always null
 */
function locateTableByPattern(blockType, _markdownText, _pageOffsets) {
    const message = `[json-block-analyzer] Could not locate ${blockType} block: no searchable content in HTML`;
    console.error(message);
    return null;
}
|
|
398
|
+
/**
 * Resolve the page number for a character offset.
 *
 * Returns the `page` of the first entry with charStart <= offset < charEnd.
 * Offsets at or beyond the last entry's charEnd resolve to the last page;
 * anything else that matches no entry resolves to the first page.
 *
 * @param charOffset - Character offset into the markdown text
 * @param pageOffsets - Array of { page, charStart, charEnd } records
 * @returns The page number, or null when pageOffsets is empty
 */
function getPageNumberForOffset(charOffset, pageOffsets) {
    if (pageOffsets.length === 0) {
        return null;
    }
    const containing = pageOffsets.find((entry) => charOffset >= entry.charStart && charOffset < entry.charEnd);
    if (containing !== undefined) {
        return containing.page;
    }
    const lastEntry = pageOffsets[pageOffsets.length - 1];
    return charOffset >= lastEntry.charEnd ? lastEntry.page : pageOffsets[0].page;
}
|
|
416
|
+
/**
 * Merge overlapping or touching regions of an array sorted by startOffset.
 *
 * A merged region spans from the first start to the furthest end and keeps
 * the page number of its first member. Its blockType is that of the largest
 * individual region merged into it (the earliest one on ties). The size
 * comparison is made before the span is extended; comparing afterwards could
 * never favour the incoming region.
 *
 * Input region objects are not modified; merged output uses copies.
 *
 * @param regions - Regions sorted by startOffset
 * @returns Sorted, non-overlapping regions (the input array itself when it
 *          has at most one element)
 */
function mergeOverlappingRegions(regions) {
    if (regions.length <= 1) {
        return regions;
    }
    const merged = [{ ...regions[0] }];
    // Length of the single region that currently supplies the blockType.
    let dominantLength = regions[0].endOffset - regions[0].startOffset;
    for (let i = 1; i < regions.length; i++) {
        const current = regions[i];
        const currentLength = current.endOffset - current.startOffset;
        const last = merged[merged.length - 1];
        if (current.startOffset > last.endOffset) {
            // Gap between the regions: start a new output region.
            merged.push({ ...current });
            dominantLength = currentLength;
            continue;
        }
        if (currentLength > dominantLength) {
            last.blockType = current.blockType;
            dominantLength = currentLength;
        }
        last.endOffset = Math.max(last.endOffset, current.endOffset);
    }
    return merged;
}
|
|
441
|
+
/**
 * Sanity-check a region's offsets.
 *
 * @param start - Region start offset
 * @param end - Region end offset
 * @throws Error when start is negative, or when end is smaller than start
 */
function validateRegionOffsets(start, end) {
    let problem = null;
    if (start < 0) {
        problem = `Invalid negative startOffset in atomic region: ${start}`;
    }
    else if (end < start) {
        problem = `endOffset (${end}) is less than startOffset (${start}) in atomic region`;
    }
    if (problem !== null) {
        throw new Error(problem);
    }
}
|
|
452
|
+
/**
 * Count the block types in the JSON block tree and derive simple ratios.
 *
 * Every block with a truthy `block_type` counts toward the total. Recognized
 * types are additionally tallied into a category: Text; Table/TableGroup;
 * Figure/FigureGroup; Code; ListItem/List; PageHeader; PageFooter;
 * SectionHeader/Title; Page.
 *
 * Ratios are rounded to two decimals. Per-page ratios divide by at least one
 * page; text_density is the share of Text blocks among blocks that are not
 * Page, PageHeader or PageFooter (0 when there are none).
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @returns The statistics object, or null when jsonBlocks is falsy
 */
export function computeBlockTypeStats(jsonBlocks) {
    if (!jsonBlocks) {
        return null;
    }
    const categoryByType = new Map([
        ['Text', 'text'],
        ['Table', 'table'],
        ['TableGroup', 'table'],
        ['Figure', 'figure'],
        ['FigureGroup', 'figure'],
        ['Code', 'code'],
        ['ListItem', 'list'],
        ['List', 'list'],
        ['PageHeader', 'header'],
        ['PageFooter', 'footer'],
        ['SectionHeader', 'heading'],
        ['Title', 'heading'],
        ['Page', 'page'],
    ]);
    const tally = {
        total: 0,
        text: 0,
        table: 0,
        figure: 0,
        code: 0,
        list: 0,
        header: 0,
        footer: 0,
        heading: 0,
        page: 0,
    };
    const visit = (node) => {
        const type = node.block_type;
        if (type) {
            // Unrecognized types still count toward the total.
            tally.total += 1;
            const category = categoryByType.get(type);
            if (category !== undefined) {
                tally[category] += 1;
            }
        }
        const kids = node.children ?? node.blocks;
        if (Array.isArray(kids)) {
            for (const kid of kids) {
                visit(kid);
            }
        }
    };
    visit(jsonBlocks);
    const roundedRatio = (numerator, denominator) => Math.round((numerator / denominator) * 100) / 100;
    const pagesForRatio = Math.max(tally.page, 1);
    const contentBlocks = tally.total - tally.page - tally.header - tally.footer;
    return {
        total_blocks: tally.total,
        text_blocks: tally.text,
        table_blocks: tally.table,
        figure_blocks: tally.figure,
        code_blocks: tally.code,
        list_blocks: tally.list,
        header_blocks: tally.header,
        footer_blocks: tally.footer,
        heading_blocks: tally.heading,
        page_count: tally.page,
        tables_per_page: roundedRatio(tally.table, pagesForRatio),
        figures_per_page: roundedRatio(tally.figure, pagesForRatio),
        text_density: contentBlocks > 0 ? roundedRatio(tally.text, contentBlocks) : 0,
    };
}
|
|
545
|
+
// ---------------------------------------------------------------------------
|
|
546
|
+
// Block-Type Confidence Scoring (ME-8 / Task 4.3)
|
|
547
|
+
// ---------------------------------------------------------------------------
|
|
548
|
+
/**
 * Confidence scores per block type, used to derive a chunk quality score
 * from the block types present in a chunk.
 *
 * NOTE(review): FigureGroup has no entry here, so it gets the default score
 * (0.7) rather than Figure's 0.6 - confirm this is intended.
 */
export const BLOCK_TYPE_CONFIDENCE = {
    Table: 0.9,
    TableGroup: 0.9,
    Code: 0.9,
    SectionHeader: 0.85,
    Title: 0.85,
    ListItem: 0.8,
    List: 0.8,
    Text: 0.7,
    Figure: 0.6,
    PageHeader: 0.5,
    PageFooter: 0.5,
};
/**
 * Compute a confidence score for a chunk from the block types it contains.
 *
 * The score is the mean of the per-type scores in BLOCK_TYPE_CONFIDENCE.
 * Types without an entry score 0.7, and an empty list also yields 0.7.
 * Only own properties of the table are consulted, so a type string such as
 * 'constructor' or 'toString' is treated as unknown instead of picking up a
 * member inherited from Object.prototype.
 *
 * @param contentTypes - Array of block type strings from the chunk
 * @returns Mean confidence score of the listed block types
 */
export function computeBlockConfidence(contentTypes) {
    const defaultScore = 0.7;
    if (contentTypes.length === 0) {
        return defaultScore;
    }
    const scoreOf = (type) => {
        const isKnown = Object.prototype.hasOwnProperty.call(BLOCK_TYPE_CONFIDENCE, type);
        return (isKnown ? BLOCK_TYPE_CONFIDENCE[type] : undefined) ?? defaultScore;
    };
    const total = contentTypes.reduce((sum, type) => sum + scoreOf(type), 0);
    return total / contentTypes.length;
}
|
|
582
|
+
/**
 * Get the plain text of a block.
 *
 * Sources are tried in order: the block's `html` (tags stripped), its `text`,
 * then the texts of its children (`children`, falling back to `blocks`)
 * joined with single spaces. The result is trimmed.
 *
 * @returns The block's text, or '' when none of the sources yields any
 */
function extractBlockText(block) {
    const html = block.html ?? '';
    if (html.length > 0) {
        return stripHtmlTags(html).trim();
    }
    const directText = block.text ?? '';
    if (directText.length > 0) {
        return directText.trim();
    }
    const children = block.children ?? block.blocks;
    if (!Array.isArray(children)) {
        return '';
    }
    return children
        .map((child) => extractBlockText(child))
        .filter((childText) => childText.length > 0)
        .join(' ')
        .trim();
}
|
|
611
|
+
/**
 * Collect page header/footer texts and flag the ones that repeat.
 *
 * Walks the block tree, counting Page blocks and recording the text of every
 * PageHeader and PageFooter block that has any. A text is "repeated" when it
 * occurs at least twice and its occurrence count exceeds half the page count
 * (the page count is treated as at least 1).
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @returns { headerTexts, footerTexts, repeatedHeaders, repeatedFooters };
 *          all arrays are empty when jsonBlocks is falsy
 */
export function detectRepeatedHeadersFooters(jsonBlocks) {
    const result = {
        headerTexts: [],
        footerTexts: [],
        repeatedHeaders: [],
        repeatedFooters: [],
    };
    if (!jsonBlocks) {
        return result;
    }
    const headerCounts = new Map();
    const footerCounts = new Map();
    let pageCount = 0;
    // Record one header/footer occurrence, ignoring blocks without text.
    const record = (block, texts, counts) => {
        const text = extractBlockText(block);
        if (text.length === 0) {
            return;
        }
        texts.push(text);
        counts.set(text, (counts.get(text) ?? 0) + 1);
    };
    walkBlocks(jsonBlocks, (block) => {
        switch (block.block_type) {
            case 'Page':
                pageCount += 1;
                break;
            case 'PageHeader':
                record(block, result.headerTexts, headerCounts);
                break;
            case 'PageFooter':
                record(block, result.footerTexts, footerCounts);
                break;
            default:
                break;
        }
    }, 0);
    const threshold = Math.max(pageCount, 1) / 2;
    const collectRepeated = (counts, target) => {
        for (const [text, occurrences] of counts) {
            if (occurrences >= 2 && occurrences > threshold) {
                target.push(text);
            }
        }
    };
    collectRepeated(headerCounts, result.repeatedHeaders);
    collectRepeated(footerCounts, result.repeatedFooters);
    return result;
}
|
|
675
|
+
/**
 * Decide whether a chunk's text is essentially one of the repeated
 * header/footer texts.
 *
 * Both sides are compared in normalized form (lowercased, whitespace runs
 * collapsed, trimmed). A chunk matches a repeated text when they are equal,
 * when the chunk is contained in the repeated text, or when the chunk
 * contains the repeated text and is at most 1.5 times its length.
 *
 * @param chunkText - The chunk text to check
 * @param repeatedTexts - Array of repeated header/footer texts
 * @returns true if the chunk matches a repeated header/footer
 */
export function isRepeatedHeaderFooter(chunkText, repeatedTexts) {
    if (repeatedTexts.length === 0) {
        return false;
    }
    const canonical = (value) => value.toLowerCase().replace(/\s+/g, ' ').trim();
    const chunk = canonical(chunkText);
    if (chunk.length === 0) {
        return false;
    }
    for (const repeated of repeatedTexts) {
        const candidate = canonical(repeated);
        const isSame = chunk === candidate;
        const chunkInsideCandidate = chunk.length <= candidate.length * 1.2 && candidate.includes(chunk);
        const candidateInsideShortChunk = chunk.length <= candidate.length * 1.5 && chunk.includes(candidate);
        if (isSame || chunkInsideCandidate || candidateInsideShortChunk) {
            return true;
        }
    }
    return false;
}
|
|
705
|
+
/**
 * Extract table structures from the JSON block tree.
 *
 * For every Table/TableGroup block that can be located in the markdown text,
 * gathers column headers, row/column counts, the first data row, an optional
 * caption and a summary. Headers and dimensions come from the JSON block
 * first and fall back to parsing the located markdown. The text of the most
 * recent non-table block becomes the caption when it starts with
 * "Table <n>." / "Figure <n>:" (case-insensitive), truncated to 200 chars.
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @param markdownText - The full markdown text to search within
 * @param pageOffsets - Page offset records used for page number assignment
 * @returns Array of table structures in traversal order
 */
export function extractTableStructures(jsonBlocks, markdownText, pageOffsets) {
    if (!jsonBlocks || markdownText.length === 0) {
        return [];
    }
    const structures = [];
    const isTableType = (type) => type === 'Table' || type === 'TableGroup';
    // Text of the latest non-table block; a caption candidate for the next table.
    let captionCandidate = '';
    walkBlocks(jsonBlocks, (block, pageNum) => {
        const blockType = block.block_type;
        if (!isTableType(blockType)) {
            if (blockType) {
                const text = extractBlockText(block);
                if (text.length > 0) {
                    captionCandidate = text;
                }
            }
            return;
        }
        // The markdown span is needed up front for the markdown fallbacks.
        const region = locateBlockInMarkdown(block, blockType, pageNum, markdownText, pageOffsets);
        if (!region) {
            captionCandidate = '';
            return;
        }
        const tableMarkdown = markdownText.slice(region.startOffset, region.endOffset);
        const headersFromBlock = extractTableColumnHeaders(block);
        const columnHeaders = headersFromBlock.length === 0
            ? extractHeadersFromMarkdown(tableMarkdown)
            : headersFromBlock;
        const blockDims = countTableDimensions(block, columnHeaders.length);
        let rowCount = blockDims.rowCount;
        let columnCount = blockDims.columnCount;
        if (rowCount === 0) {
            const markdownDims = countTableDimensionsFromMarkdown(tableMarkdown);
            rowCount = markdownDims.rowCount;
            if (columnCount === 0) {
                columnCount = markdownDims.columnCount;
            }
        }
        const firstRowValues = extractFirstDataRow(tableMarkdown);
        const looksLikeCaption = captionCandidate.length > 0
            && /^(Table|Figure)\s+\d+[.:]/i.test(captionCandidate);
        const caption = looksLikeCaption ? captionCandidate.slice(0, 200) : undefined;
        const summary = generateTableSummary(columnHeaders, rowCount, firstRowValues, caption);
        structures.push({
            startOffset: region.startOffset,
            endOffset: region.endOffset,
            columnHeaders,
            rowCount,
            columnCount,
            pageNumber: region.pageNumber,
            summary,
            firstRowValues,
            caption,
        });
        // A caption candidate applies to at most one table.
        captionCandidate = '';
    }, 0);
    // Cross-page table continuity detection
    detectTableContinuations(structures);
    return structures;
}
|
|
783
|
+
/**
 * Read the column headers from markdown table text.
 *
 * Returns the non-empty, trimmed cells of the first line that contains a
 * pipe and is not a separator row (such as |---|---|).
 *
 * @param tableMarkdown - Markdown text of one table
 * @returns The header cells, or [] when no such line exists
 */
export function extractHeadersFromMarkdown(tableMarkdown) {
    const separatorRow = /^\|?[\s-:|]+\|?$/;
    for (const rawLine of tableMarkdown.split('\n')) {
        const row = rawLine.trim();
        if (row.length === 0 || !row.includes('|') || separatorRow.test(row)) {
            continue;
        }
        const cells = row
            .split('|')
            .map((cell) => cell.trim())
            .filter((cell) => cell.length > 0);
        if (cells.length > 0) {
            return cells;
        }
    }
    return [];
}
|
|
805
|
+
/**
 * Count table dimensions from markdown pipe-delimited text.
 *
 * The first non-separator row is treated as the header; every later
 * non-separator row counts as a data row. Separator rows (|---|:--|) are
 * ignored entirely.
 *
 * The column count is the widest row measured in structural cells, i.e. the
 * number of pipe-delimited slots after removing the optional outer pipes.
 * Blank cells therefore still count as columns, so a table with a blank
 * corner header and sparse rows is not reported narrower than it is.
 *
 * @param {string} tableMarkdown - Markdown text of the table.
 * @returns {{rowCount: number, columnCount: number}} Data-row count (header
 *   excluded) and widest column count; both 0 when no pipe row is present.
 */
export function countTableDimensionsFromMarkdown(tableMarkdown) {
    let columnCount = 0;
    let rowCount = 0;
    let headerSeen = false;
    for (const rawLine of tableMarkdown.split('\n')) {
        const row = rawLine.trim();
        if (row.length === 0 || !row.includes('|')) {
            continue;
        }
        // Rows made only of whitespace, '-', ':' and '|' are separators.
        if (/^\|?[\s-:|]+\|?$/.test(row)) {
            continue;
        }
        // Drop the optional leading/trailing border pipe, then every remaining
        // pipe separates two cells, whether or not those cells have content.
        const cellCount = row.replace(/^\|/, '').replace(/\|$/, '').split('|').length;
        if (cellCount > columnCount) {
            columnCount = cellCount;
        }
        if (headerSeen) {
            rowCount += 1;
        }
        else {
            headerSeen = true; // first non-separator row is the header
        }
    }
    return { rowCount, columnCount };
}
|
|
834
|
+
/**
 * Return the cell values of the first data row of a markdown table.
 *
 * Lines without a pipe, blank lines and separator rows are ignored. Of the
 * remaining rows the first is taken to be the header and the second is the
 * row whose values are returned.
 *
 * @param {string} tableMarkdown - Markdown text of the table.
 * @returns {string[]} Trimmed, non-empty cells of the first data row, or `[]`
 *   when the table has no row after the header.
 */
export function extractFirstDataRow(tableMarkdown) {
    const contentRows = tableMarkdown
        .split('\n')
        .map(line => line.trim())
        .filter(row => row.length > 0 && row.includes('|') && !/^\|?[\s-:|]+\|?$/.test(row));
    // Index 0 is the header row; index 1, when present, is the first data row.
    const [, firstDataRow] = contentRows;
    if (firstDataRow === undefined) {
        return [];
    }
    const values = [];
    for (const piece of firstDataRow.split('|')) {
        const text = piece.trim();
        if (text.length > 0) {
            values.push(text);
        }
    }
    return values;
}
|
|
854
|
+
/**
 * Generate a human-readable summary of table content.
 *
 * Format: "<caption>. Table with N rows and columns: col1, col2. Sample: val1, val2"
 * where the caption, the column list and the sample are each included only
 * when available. A single-row table is described as "1 row"; when the row
 * count is not positive the count is omitted ("Table with rows").
 *
 * The result is capped at 200 UTF-16 code units. Longer summaries are cut and
 * suffixed with "...". The cut never lands between the two halves of a
 * surrogate pair, so the output does not contain a lone surrogate.
 *
 * @param {string[]} columnHeaders - Column header texts (may be empty).
 * @param {number} rowCount - Number of rows to report.
 * @param {string[]} firstRowValues - Values of the first data row (may be empty).
 * @param {string} [caption] - Optional caption placed first in the summary.
 * @returns {string} Summary text, at most 200 code units long.
 */
export function generateTableSummary(columnHeaders, rowCount, firstRowValues, caption) {
    const maxLength = 200;
    const ellipsis = '...';
    const parts = [];
    if (caption) {
        parts.push(caption);
    }
    let rowDesc = 'rows';
    if (rowCount > 0) {
        rowDesc = rowCount === 1 ? '1 row' : `${rowCount} rows`;
    }
    if (columnHeaders.length > 0) {
        parts.push(`Table with ${rowDesc} and columns: ${columnHeaders.join(', ')}`);
    }
    else {
        parts.push(`Table with ${rowDesc}`);
    }
    if (firstRowValues.length > 0) {
        parts.push(`Sample: ${firstRowValues.join(', ')}`);
    }
    const summary = parts.join('. ');
    if (summary.length <= maxLength) {
        return summary;
    }
    let cut = maxLength - ellipsis.length;
    // If the last kept code unit is a high surrogate, its low half would be
    // cut off; drop it too rather than emit a broken character.
    const lastKept = summary.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        cut -= 1;
    }
    return summary.slice(0, cut) + ellipsis;
}
|
|
880
|
+
/**
 * Link tables that look like continuations of the table before them.
 *
 * Each table is compared with its immediate predecessor in `structures`.
 * When both have column headers, their known page numbers are no more than
 * one page apart, and the header similarity is at least 0.8, the later
 * table gets `continuationOf` set to the predecessor's index. The array is
 * modified in place; nothing is returned.
 *
 * NOTE(review): two tables on the same page also pass the page check —
 * confirm that is intended for a "cross-page" detector.
 *
 * @param {Array<{columnHeaders: string[], pageNumber: (number|null), continuationOf?: number}>} structures
 *   Table structures, presumably in document order — verify against caller.
 * @returns {void}
 */
function detectTableContinuations(structures) {
    for (const [index, current] of structures.entries()) {
        if (index === 0) {
            continue; // the first table has no predecessor
        }
        const previous = structures[index - 1];
        // Header comparison is meaningless unless both tables have headers.
        const bothHaveHeaders = previous.columnHeaders.length > 0 && current.columnHeaders.length > 0;
        if (!bothHaveHeaders) {
            continue;
        }
        // The page gap is only enforced when both page numbers are known.
        const pagesKnown = previous.pageNumber !== null && current.pageNumber !== null;
        if (pagesKnown && current.pageNumber - previous.pageNumber > 1) {
            continue;
        }
        const similarity = columnHeaderOverlap(previous.columnHeaders, current.columnHeaders);
        if (similarity >= 0.8) {
            current.continuationOf = index - 1;
        }
    }
}
|
|
905
|
+
/**
 * Sorensen-Dice similarity of two column header lists.
 *
 * Headers are lower-cased and trimmed, then de-duplicated, before comparison.
 * The score is 2 * |shared| / (|A| + |B|) over the resulting sets.
 *
 * @param {string[]} a - First header list.
 * @param {string[]} b - Second header list.
 * @returns {number} 1 when both lists are empty, 0 when exactly one is empty,
 *   otherwise the Dice coefficient in the range [0, 1].
 */
function columnHeaderOverlap(a, b) {
    if (a.length === 0) {
        return b.length === 0 ? 1 : 0;
    }
    if (b.length === 0) {
        return 0;
    }
    const normalize = headers => new Set(headers.map(h => h.toLowerCase().trim()));
    const left = normalize(a);
    const right = normalize(b);
    const shared = [...left].filter(header => right.has(header)).length;
    return (2 * shared) / (left.size + right.size);
}
|
|
922
|
+
/**
 * Extract column headers from the first usable row of a table block.
 *
 * Children are read from `block.children`, falling back to `block.blocks`.
 * They are visited in order: a child of type TableRow, Row or TableHeader
 * yields the non-empty texts of its cells; a child of type Table (as found
 * inside a TableGroup) is searched recursively. The first non-empty result
 * wins. If nothing is found, the block's HTML is parsed instead.
 *
 * @param {object} block - Table-like block exposing `children` or `blocks`.
 * @returns {string[]} Header texts, or whatever the HTML fallback returns.
 */
function extractTableColumnHeaders(block) {
    const rowLikeTypes = ['TableRow', 'Row', 'TableHeader'];
    const childBlocks = block.children ?? block.blocks;
    if (Array.isArray(childBlocks)) {
        for (const child of childBlocks) {
            let found = [];
            if (rowLikeTypes.includes(child.block_type)) {
                const cells = child.children ?? child.blocks;
                if (Array.isArray(cells)) {
                    // Cells with no text are skipped, so a row of blanks yields nothing
                    // and the search moves on to the next child.
                    found = cells
                        .map(cell => extractBlockText(cell))
                        .filter(text => text.length > 0);
                }
            }
            else if (child.block_type === 'Table') {
                // For TableGroup, check nested Table children
                found = extractTableColumnHeaders(child);
            }
            if (found.length > 0) {
                return found;
            }
        }
    }
    // Fallback: try HTML parsing
    return extractHeadersFromHtml(block);
}
|
|
961
|
+
/**
 * Extract table headers from block HTML content (fallback).
 *
 * Prefers `<th>` cells anywhere in the markup; when there are none, uses the
 * `<td>` cells of the first `<tr>`. Cell markup is reduced to text with
 * `stripHtmlTags`, trimmed, and empty results are dropped.
 *
 * Matching uses `[\s\S]*?` rather than `.*?` so rows and cells that span
 * several lines are found (`.` does not match line terminators). Tag names
 * are anchored with `\b` so `<thead>` is not mistaken for a `<th>` cell.
 *
 * @param {object} block - Block whose `html` property holds the table markup.
 * @returns {string[]} Header texts, or `[]` when no HTML or no cells are found.
 */
function extractHeadersFromHtml(block) {
    const html = block.html ?? '';
    if (html.length === 0) {
        return [];
    }
    const toCellTexts = fragments => fragments
        .map(fragment => stripHtmlTags(fragment).trim())
        .filter(text => text.length > 0);
    // Try to find <th> elements first
    const headerCells = html.match(/<th\b[^>]*>[\s\S]*?<\/th>/gi);
    if (headerCells && headerCells.length > 0) {
        return toCellTexts(headerCells);
    }
    // Try first <tr> and extract <td> elements
    const firstRow = html.match(/<tr\b[^>]*>([\s\S]*?)<\/tr>/i);
    if (firstRow) {
        const dataCells = firstRow[1].match(/<td\b[^>]*>[\s\S]*?<\/td>/gi);
        if (dataCells && dataCells.length > 0) {
            return toCellTexts(dataCells);
        }
    }
    return [];
}
|
|
984
|
+
/**
 * Count the rows and columns of a table block, with an HTML fallback.
 *
 * Children are read from `block.children`, falling back to `block.blocks`.
 * Every TableRow / Row / TableHeader child counts as one row and may widen
 * the column count; a nested Table child (TableGroup case) contributes its
 * own recursively computed counts. The column count starts at
 * `headerColumnCount` and only ever grows.
 *
 * When the children yield no rows, `block.html` is inspected instead: the
 * row count becomes the number of `<tr>` tags minus one, and, only if the
 * column count is still 0, the cells of the first `<tr>` set it.
 *
 * NOTE(review): the children path counts every row-like child, whereas the
 * HTML path subtracts one for the header — confirm which meaning of
 * `rowCount` callers expect.
 *
 * @param {object} block - Table-like block exposing `children`/`blocks` and optionally `html`.
 * @param {number} headerColumnCount - Column count already known from the headers.
 * @returns {{rowCount: number, columnCount: number}} Counted dimensions.
 */
function countTableDimensions(block, headerColumnCount) {
    let rows = 0;
    let widest = headerColumnCount;
    const widen = (candidate) => {
        if (candidate > widest) {
            widest = candidate;
        }
    };
    const nestedBlocks = block.children ?? block.blocks;
    if (Array.isArray(nestedBlocks)) {
        for (const child of nestedBlocks) {
            switch (child.block_type) {
                case 'TableRow':
                case 'Row':
                case 'TableHeader': {
                    rows += 1;
                    const cells = child.children ?? child.blocks;
                    if (Array.isArray(cells)) {
                        widen(cells.length);
                    }
                    break;
                }
                case 'Table': {
                    // Nested table in TableGroup
                    const inner = countTableDimensions(child, headerColumnCount);
                    rows += inner.rowCount;
                    widen(inner.columnCount);
                    break;
                }
                default:
                    break;
            }
        }
    }
    if (rows !== 0) {
        return { rowCount: rows, columnCount: widest };
    }
    // HTML fallback: count <tr> elements when block children yield 0 rows
    const html = block.html ?? '';
    if (html.length > 0) {
        const rowTags = html.match(/<tr[^>]*>/gi);
        if (rowTags) {
            // Subtract 1 for header row (data rows only)
            rows = Math.max(0, rowTags.length - 1);
        }
        // Count max columns from HTML if needed
        if (widest === 0) {
            const firstRow = html.match(/<tr[^>]*>([\s\S]*?)<\/tr>/i);
            if (firstRow) {
                widen((firstRow[1].match(/<t[dh][^>]*>/gi) ?? []).length);
            }
        }
    }
    return { rowCount: rows, columnCount: widest };
}
|
|
1033
|
+
//# sourceMappingURL=json-block-analyzer.js.map
|