kreuzberg 3.10.0__tar.gz → 3.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/ci.yaml +97 -46
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/PKG-INFO +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_api/main.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_chunker.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_config.py +23 -2
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py +40 -5
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py +31 -8
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_structured.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_gmft.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_language_detection.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mcp/server.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_base.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_easyocr.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_paddleocr.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_playa.py +3 -1
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_types.py +5 -5
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_device.py +6 -6
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_document_cache.py +1 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/cli.py +6 -6
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/pyproject.toml +17 -2
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/api/main_test.py +323 -0
- kreuzberg-3.10.1/tests/cli_command_test.py +523 -0
- kreuzberg-3.10.1/tests/config_test.py +1570 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/conftest.py +6 -0
- kreuzberg-3.10.1/tests/document_classification_test.py +886 -0
- kreuzberg-3.10.1/tests/entity_extraction_test.py +675 -0
- kreuzberg-3.10.1/tests/extraction_test.py +834 -0
- kreuzberg-3.10.1/tests/extractors/email_test.py +1003 -0
- kreuzberg-3.10.1/tests/extractors/image_test.py +768 -0
- kreuzberg-3.10.1/tests/extractors/pandoc_test.py +2123 -0
- kreuzberg-3.10.1/tests/extractors/pdf_test.py +973 -0
- kreuzberg-3.10.1/tests/extractors/presentation_test.py +1005 -0
- kreuzberg-3.10.1/tests/extractors/spreed_sheet_test.py +1237 -0
- kreuzberg-3.10.1/tests/extractors/structured_test.py +302 -0
- kreuzberg-3.10.1/tests/gmft_test.py +720 -0
- kreuzberg-3.10.1/tests/language_detection_test.py +172 -0
- kreuzberg-3.10.1/tests/mcp_server_test.py +883 -0
- kreuzberg-3.10.1/tests/ocr/tesseract_test.py +1141 -0
- kreuzberg-3.10.1/tests/playa_helpers_test.py +549 -0
- kreuzberg-3.10.1/tests/types_test.py +440 -0
- kreuzberg-3.10.1/tests/utils/string_test.py +305 -0
- kreuzberg-3.10.1/tests/utils_errors_test.py +299 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/uv.lock +154 -123
- kreuzberg-3.10.0/tests/config_test.py +0 -401
- kreuzberg-3.10.0/tests/document_classification_test.py +0 -86
- kreuzberg-3.10.0/tests/entity_extraction_test.py +0 -102
- kreuzberg-3.10.0/tests/extraction_test.py +0 -389
- kreuzberg-3.10.0/tests/extractors/email_comprehensive_test.py +0 -326
- kreuzberg-3.10.0/tests/extractors/email_test.py +0 -31
- kreuzberg-3.10.0/tests/extractors/image_test.py +0 -275
- kreuzberg-3.10.0/tests/extractors/pandoc_test.py +0 -458
- kreuzberg-3.10.0/tests/extractors/pdf_test.py +0 -438
- kreuzberg-3.10.0/tests/extractors/presentation_test.py +0 -410
- kreuzberg-3.10.0/tests/extractors/spreed_sheet_test.py +0 -325
- kreuzberg-3.10.0/tests/extractors/structured_test.py +0 -90
- kreuzberg-3.10.0/tests/gmft_test.py +0 -397
- kreuzberg-3.10.0/tests/language_detection_test.py +0 -237
- kreuzberg-3.10.0/tests/mcp_server_test.py +0 -382
- kreuzberg-3.10.0/tests/ocr/tesseract_test.py +0 -477
- kreuzberg-3.10.0/tests/types_test.py +0 -191
- kreuzberg-3.10.0/tests/utils/string_test.py +0 -85
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.commitlintrc +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.deepsource.toml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.docker/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.dockerignore +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.gitignore +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/LICENSE +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/changelog.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/cli.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/contributing.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
|
|
51
51
|
- name: Execute Pre-Commit
|
52
52
|
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
53
53
|
|
54
|
+
# Coverage job runs first, only on Python 3.13 Ubuntu
|
55
|
+
coverage:
|
56
|
+
needs: validate
|
57
|
+
runs-on: ubuntu-latest
|
58
|
+
timeout-minutes: 20
|
59
|
+
steps:
|
60
|
+
- name: Checkout
|
61
|
+
uses: actions/checkout@v4
|
62
|
+
|
63
|
+
- name: Install uv
|
64
|
+
uses: astral-sh/setup-uv@v6
|
65
|
+
with:
|
66
|
+
enable-cache: true
|
67
|
+
|
68
|
+
- name: Install Python
|
69
|
+
uses: actions/setup-python@v5
|
70
|
+
id: setup-python
|
71
|
+
with:
|
72
|
+
python-version: "3.13"
|
73
|
+
|
74
|
+
- name: Cache Python Dependencies
|
75
|
+
id: python-cache
|
76
|
+
uses: actions/cache@v4
|
77
|
+
with:
|
78
|
+
path: |
|
79
|
+
~/.cache/uv
|
80
|
+
.venv
|
81
|
+
key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
|
82
|
+
restore-keys: |
|
83
|
+
python-dependencies-ubuntu-latest-3.13-
|
84
|
+
|
85
|
+
- name: Install Dependencies
|
86
|
+
uses: nick-fields/retry@v3
|
87
|
+
with:
|
88
|
+
timeout_minutes: 5
|
89
|
+
max_attempts: 3
|
90
|
+
retry_wait_seconds: 30
|
91
|
+
command: |
|
92
|
+
uv sync --all-packages --all-extras --dev
|
93
|
+
shell: bash
|
94
|
+
|
95
|
+
- name: Install System Dependencies
|
96
|
+
uses: nick-fields/retry@v3
|
97
|
+
with:
|
98
|
+
timeout_minutes: 5
|
99
|
+
max_attempts: 3
|
100
|
+
retry_wait_seconds: 30
|
101
|
+
command: |
|
102
|
+
sudo apt-get update
|
103
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
104
|
+
shell: bash
|
105
|
+
|
106
|
+
- name: Run Tests with Coverage
|
107
|
+
uses: nick-fields/retry@v3
|
108
|
+
with:
|
109
|
+
timeout_minutes: 15
|
110
|
+
max_attempts: 3
|
111
|
+
retry_wait_seconds: 10
|
112
|
+
command: |
|
113
|
+
uv run coverage erase
|
114
|
+
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
|
115
|
+
uv run coverage report --precision=2
|
116
|
+
shell: bash
|
117
|
+
|
118
|
+
- name: Upload Coverage to DeepSource
|
119
|
+
if: always() && github.event_name == 'push'
|
120
|
+
env:
|
121
|
+
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
122
|
+
run: |
|
123
|
+
# Install DeepSource CLI
|
124
|
+
curl -fsSL https://deepsource.io/cli | sh
|
125
|
+
# Upload coverage report
|
126
|
+
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
127
|
+
|
128
|
+
- name: Upload Coverage Artifacts
|
129
|
+
if: always()
|
130
|
+
uses: actions/upload-artifact@v4
|
131
|
+
with:
|
132
|
+
name: coverage-report-${{ github.sha }}
|
133
|
+
path: |
|
134
|
+
coverage.lcov
|
135
|
+
.coverage
|
136
|
+
retention-days: 7
|
137
|
+
|
138
|
+
# Full test matrix runs only after coverage succeeds
|
54
139
|
test:
|
140
|
+
needs: coverage
|
141
|
+
runs-on: ${{ matrix.os }}
|
55
142
|
strategy:
|
143
|
+
fail-fast: false
|
56
144
|
matrix:
|
57
|
-
os: [
|
58
|
-
python:
|
59
|
-
|
145
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
146
|
+
python: ["3.10", "3.11", "3.12", "3.13"]
|
147
|
+
exclude:
|
148
|
+
# Skip Python 3.13 on macOS for now due to compatibility issues
|
149
|
+
- os: macos-latest
|
150
|
+
python: "3.13"
|
60
151
|
timeout-minutes: 30
|
61
152
|
steps:
|
62
153
|
- name: Checkout
|
@@ -146,52 +237,12 @@ jobs:
|
|
146
237
|
pandoc --version
|
147
238
|
shell: pwsh
|
148
239
|
|
149
|
-
- name:
|
150
|
-
run: |
|
151
|
-
rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
|
152
|
-
shell: bash
|
153
|
-
|
154
|
-
- name: Run Tests with Coverage
|
155
|
-
run: |
|
156
|
-
uv run coverage erase
|
157
|
-
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
|
158
|
-
|
159
|
-
- name: Upload Coverage Artifacts
|
160
|
-
if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
|
161
|
-
uses: actions/upload-artifact@v4
|
162
|
-
with:
|
163
|
-
name: coverage-report
|
164
|
-
path: coverage.lcov
|
165
|
-
retention-days: 1
|
166
|
-
|
167
|
-
upload-coverage:
|
168
|
-
needs: test
|
169
|
-
runs-on: ubuntu-latest
|
170
|
-
if: github.event_name == 'push' || github.event_name == 'pull_request'
|
171
|
-
steps:
|
172
|
-
- name: Checkout
|
173
|
-
uses: actions/checkout@v4
|
174
|
-
with:
|
175
|
-
ref: ${{ github.event.pull_request.head.sha || github.sha }}
|
176
|
-
|
177
|
-
- name: Download Coverage Artifacts
|
178
|
-
uses: actions/download-artifact@v4
|
179
|
-
with:
|
180
|
-
name: coverage-report
|
181
|
-
path: .
|
182
|
-
|
183
|
-
- name: Install DeepSource CLI
|
240
|
+
- name: Run Tests (without coverage)
|
184
241
|
uses: nick-fields/retry@v3
|
185
242
|
with:
|
186
|
-
timeout_minutes:
|
243
|
+
timeout_minutes: 15
|
187
244
|
max_attempts: 3
|
188
245
|
retry_wait_seconds: 10
|
189
246
|
command: |
|
190
|
-
|
247
|
+
uv run pytest -s -vvv --reruns 2 --reruns-delay 1
|
191
248
|
shell: bash
|
192
|
-
|
193
|
-
- name: Upload Coverage to DeepSource
|
194
|
-
env:
|
195
|
-
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
196
|
-
run: |
|
197
|
-
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.10.0
|
3
|
+
Version: 3.10.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
|
|
108
108
|
return results # type: ignore[return-value]
|
109
109
|
|
110
110
|
|
111
|
-
if __name__ == "__main__":
|
111
|
+
if __name__ == "__main__": # pragma: no cover
|
112
112
|
baseline_results = asyncio.run(run_baseline_benchmark())
|
113
113
|
|
114
114
|
baseline_file = Path("baseline_results.json")
|
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
|
|
195
195
|
}
|
196
196
|
|
197
197
|
|
198
|
-
if __name__ == "__main__":
|
198
|
+
if __name__ == "__main__": # pragma: no cover
|
199
199
|
print("🧪 REPRODUCIBLE CACHE BENCHMARK")
|
200
200
|
print("Testing msgpack implementation with statistical rigor...")
|
201
201
|
print()
|
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
|
|
187
187
|
}
|
188
188
|
|
189
189
|
|
190
|
-
if __name__ == "__main__":
|
190
|
+
if __name__ == "__main__": # pragma: no cover
|
191
191
|
print("🧪 STATISTICAL CACHE BENCHMARK")
|
192
192
|
print("Testing msgpack implementation with proper error analysis...")
|
193
193
|
print()
|
@@ -30,7 +30,7 @@ try:
|
|
30
30
|
HTTP_422_UNPROCESSABLE_ENTITY,
|
31
31
|
HTTP_500_INTERNAL_SERVER_ERROR,
|
32
32
|
)
|
33
|
-
except ImportError as e:
|
33
|
+
except ImportError as e: # pragma: no cover
|
34
34
|
raise MissingDependencyError.create_for_package(
|
35
35
|
dependency_group="litestar",
|
36
36
|
functionality="Litestar API and docker container",
|
@@ -43,7 +43,7 @@ def get_chunker(
|
|
43
43
|
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
44
44
|
|
45
45
|
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
|
-
except ImportError as e:
|
46
|
+
except ImportError as e: # pragma: no cover
|
47
47
|
raise MissingDependencyError.create_for_package(
|
48
48
|
dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
|
49
49
|
) from e
|
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
|
|
13
13
|
|
14
14
|
if sys.version_info >= (3, 11):
|
15
15
|
import tomllib
|
16
|
-
else:
|
16
|
+
else: # pragma: no cover
|
17
17
|
import tomli as tomllib # type: ignore[import-not-found]
|
18
18
|
|
19
19
|
from kreuzberg._gmft import GMFTConfig
|
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
50
50
|
# Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
|
51
51
|
if config_path.name == "kreuzberg.toml":
|
52
52
|
return data # type: ignore[no-any-return]
|
53
|
-
|
53
|
+
|
54
|
+
# For other files, check if they have [tool.kreuzberg] section
|
55
|
+
if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
|
56
|
+
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
57
|
+
|
58
|
+
# Otherwise assume root-level configuration
|
59
|
+
return data # type: ignore[no-any-return]
|
54
60
|
|
55
61
|
|
56
62
|
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
129
135
|
"extract_keywords",
|
130
136
|
"auto_detect_language",
|
131
137
|
"enable_quality_processing",
|
138
|
+
"auto_detect_document_type",
|
139
|
+
"document_type_confidence_threshold",
|
140
|
+
"document_classification_mode",
|
141
|
+
"keyword_count",
|
132
142
|
}
|
133
143
|
extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
|
134
144
|
|
135
145
|
# Handle OCR backend configuration
|
136
146
|
ocr_backend = extraction_config.get("ocr_backend")
|
137
147
|
if ocr_backend and ocr_backend != "none":
|
148
|
+
# Validate OCR backend
|
149
|
+
valid_backends = {"tesseract", "easyocr", "paddleocr"}
|
150
|
+
if ocr_backend not in valid_backends:
|
151
|
+
raise ValidationError(
|
152
|
+
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
|
153
|
+
context={"provided": ocr_backend, "valid": sorted(valid_backends)},
|
154
|
+
)
|
138
155
|
ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
|
139
156
|
if ocr_config:
|
140
157
|
extraction_config["ocr_config"] = ocr_config
|
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
|
|
286
303
|
"extract_keywords",
|
287
304
|
"auto_detect_language",
|
288
305
|
"enable_quality_processing",
|
306
|
+
"auto_detect_document_type",
|
307
|
+
"document_type_confidence_threshold",
|
308
|
+
"document_classification_mode",
|
309
|
+
"keyword_count",
|
289
310
|
]
|
290
311
|
|
291
312
|
|
@@ -4,13 +4,12 @@ import re
|
|
4
4
|
from typing import TYPE_CHECKING
|
5
5
|
|
6
6
|
from kreuzberg._ocr import get_ocr_backend
|
7
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
|
7
8
|
from kreuzberg.exceptions import MissingDependencyError
|
8
9
|
|
9
10
|
if TYPE_CHECKING:
|
10
11
|
from pathlib import Path
|
11
12
|
|
12
|
-
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
13
|
-
|
14
13
|
|
15
14
|
DOCUMENT_CLASSIFIERS = {
|
16
15
|
"invoice": [
|
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
52
51
|
Raises:
|
53
52
|
MissingDependencyError: If the deep-translator package is not installed
|
54
53
|
"""
|
54
|
+
# Combine content with metadata for classification
|
55
|
+
text_to_classify = result.content
|
56
|
+
if result.metadata:
|
57
|
+
# Add metadata values to the text for classification
|
58
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
59
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
60
|
+
|
55
61
|
try:
|
56
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
57
|
-
except ImportError as e:
|
63
|
+
except ImportError as e: # pragma: no cover
|
58
64
|
raise MissingDependencyError(
|
59
65
|
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
|
60
66
|
) from e
|
61
67
|
|
62
|
-
|
68
|
+
try:
|
69
|
+
return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
70
|
+
except Exception: # noqa: BLE001
|
71
|
+
# Fall back to original content in lowercase if translation fails
|
72
|
+
return text_to_classify.lower()
|
63
73
|
|
64
74
|
|
65
75
|
def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
|
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
|
|
73
83
|
A tuple containing the detected document type and the confidence score,
|
74
84
|
or (None, None) if no type is detected with sufficient confidence.
|
75
85
|
"""
|
86
|
+
if not config.auto_detect_document_type:
|
87
|
+
return None, None
|
88
|
+
|
76
89
|
translated_text = _get_translated_text(result)
|
77
90
|
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
|
78
91
|
|
@@ -108,7 +121,8 @@ def classify_document_from_layout(
|
|
108
121
|
A tuple containing the detected document type and the confidence score,
|
109
122
|
or (None, None) if no type is detected with sufficient confidence.
|
110
123
|
"""
|
111
|
-
|
124
|
+
if not config.auto_detect_document_type:
|
125
|
+
return None, None
|
112
126
|
|
113
127
|
if result.layout is None or result.layout.empty:
|
114
128
|
return None, None
|
@@ -117,6 +131,24 @@ def classify_document_from_layout(
|
|
117
131
|
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
118
132
|
return None, None
|
119
133
|
|
134
|
+
# Use layout text for classification, not the content
|
135
|
+
layout_text = " ".join(layout_df["text"].astype(str).tolist())
|
136
|
+
|
137
|
+
# Translate layout text directly for classification
|
138
|
+
text_to_classify = layout_text
|
139
|
+
if result.metadata:
|
140
|
+
# Add metadata values to the text for classification
|
141
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
142
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
143
|
+
|
144
|
+
try:
|
145
|
+
from deep_translator import GoogleTranslator # noqa: PLC0415
|
146
|
+
|
147
|
+
translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
148
|
+
except Exception: # noqa: BLE001
|
149
|
+
# Fall back to original content in lowercase if translation fails
|
150
|
+
translated_text = text_to_classify.lower()
|
151
|
+
|
120
152
|
layout_df["translated_text"] = translated_text
|
121
153
|
|
122
154
|
page_height = layout_df["top"].max() + layout_df["height"].max()
|
@@ -151,6 +183,9 @@ def auto_detect_document_type(
|
|
151
183
|
if config.document_classification_mode == "vision" and file_path:
|
152
184
|
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
153
185
|
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
186
|
+
elif result.layout is not None and not result.layout.empty:
|
187
|
+
# Use layout-based classification if layout data is available
|
188
|
+
result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
|
154
189
|
else:
|
155
190
|
result.document_type, result.document_type_confidence = classify_document(result, config)
|
156
191
|
return result
|
@@ -139,7 +139,7 @@ def extract_entities(
|
|
139
139
|
|
140
140
|
try:
|
141
141
|
import spacy # noqa: F401, PLC0415
|
142
|
-
except ImportError as e:
|
142
|
+
except ImportError as e: # pragma: no cover
|
143
143
|
raise MissingDependencyError.create_for_package(
|
144
144
|
package_name="spacy",
|
145
145
|
dependency_group="entity-extraction",
|
@@ -230,7 +230,7 @@ def extract_keywords(
|
|
230
230
|
return [(kw, float(score)) for kw, score in keywords]
|
231
231
|
except (RuntimeError, OSError, ValueError):
|
232
232
|
return []
|
233
|
-
except ImportError as e:
|
233
|
+
except ImportError as e: # pragma: no cover
|
234
234
|
raise MissingDependencyError.create_for_package(
|
235
235
|
package_name="keybert",
|
236
236
|
dependency_group="entity-extraction",
|
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
|
|
19
19
|
# Import optional dependencies at module level with proper error handling
|
20
20
|
try:
|
21
21
|
import mailparse
|
22
|
-
except ImportError:
|
22
|
+
except ImportError: # pragma: no cover
|
23
23
|
mailparse = None
|
24
24
|
|
25
25
|
try:
|
26
26
|
import html2text # type: ignore[import-not-found]
|
27
|
-
except ImportError:
|
27
|
+
except ImportError: # pragma: no cover
|
28
28
|
html2text = None
|
29
29
|
|
30
30
|
# Compile regex pattern once at module level
|
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
|
|
59
59
|
|
60
60
|
to_info = parsed_email.get("to")
|
61
61
|
if to_info:
|
62
|
+
# Store the raw value in metadata (could be string, dict, or list)
|
62
63
|
if isinstance(to_info, list) and to_info:
|
64
|
+
# For metadata, use first recipient's email if it's a list
|
63
65
|
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
66
|
+
metadata["email_to"] = to_email
|
64
67
|
elif isinstance(to_info, dict):
|
65
|
-
|
68
|
+
metadata["email_to"] = to_info.get("email", "")
|
66
69
|
else:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
+
metadata["email_to"] = str(to_info)
|
71
|
+
|
72
|
+
# For display, format all recipients
|
73
|
+
to_formatted = self._format_email_field(to_info)
|
74
|
+
text_parts.append(f"To: {to_formatted}")
|
70
75
|
|
71
76
|
date = parsed_email.get("date")
|
72
77
|
if date:
|
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
|
|
76
81
|
cc = parsed_email.get("cc")
|
77
82
|
if cc:
|
78
83
|
metadata["email_cc"] = cc
|
79
|
-
|
84
|
+
cc_formatted = self._format_email_field(cc)
|
85
|
+
text_parts.append(f"CC: {cc_formatted}")
|
80
86
|
|
81
87
|
bcc = parsed_email.get("bcc")
|
82
88
|
if bcc:
|
83
89
|
metadata["email_bcc"] = bcc
|
84
|
-
|
90
|
+
bcc_formatted = self._format_email_field(bcc)
|
91
|
+
text_parts.append(f"BCC: {bcc_formatted}")
|
92
|
+
|
93
|
+
def _format_email_field(self, field: Any) -> str:
|
94
|
+
"""Format email field (to, cc, bcc) for display."""
|
95
|
+
if isinstance(field, list):
|
96
|
+
emails = []
|
97
|
+
for item in field:
|
98
|
+
if isinstance(item, dict):
|
99
|
+
email = item.get("email", "")
|
100
|
+
if email:
|
101
|
+
emails.append(email)
|
102
|
+
else:
|
103
|
+
emails.append(str(item))
|
104
|
+
return ", ".join(emails)
|
105
|
+
if isinstance(field, dict):
|
106
|
+
return str(field.get("email", ""))
|
107
|
+
return str(field)
|
85
108
|
|
86
109
|
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
87
110
|
"""Extract and process email body content."""
|
@@ -82,7 +82,7 @@ class PDFExtractor(Extractor):
|
|
82
82
|
from kreuzberg._gmft import extract_tables # noqa: PLC0415
|
83
83
|
|
84
84
|
result.tables = await extract_tables(path, self.config.gmft_config)
|
85
|
-
except ImportError:
|
85
|
+
except ImportError: # pragma: no cover
|
86
86
|
result.tables = []
|
87
87
|
|
88
88
|
# Enhance metadata with table information
|
@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
|
|
6
6
|
|
7
7
|
if sys.version_info >= (3, 11):
|
8
8
|
import tomllib
|
9
|
-
else:
|
9
|
+
else: # pragma: no cover
|
10
10
|
try:
|
11
11
|
import tomli as tomllib # type: ignore[import-not-found]
|
12
|
-
except ImportError:
|
12
|
+
except ImportError: # pragma: no cover
|
13
13
|
tomllib = None
|
14
14
|
|
15
15
|
try:
|
16
16
|
import yaml
|
17
|
-
except ImportError:
|
17
|
+
except ImportError: # pragma: no cover
|
18
18
|
yaml = None
|
19
19
|
|
20
20
|
from anyio import Path as AsyncPath
|
@@ -265,7 +265,7 @@ async def extract_tables(
|
|
265
265
|
finally:
|
266
266
|
await run_sync(doc.close)
|
267
267
|
|
268
|
-
except ImportError as e:
|
268
|
+
except ImportError as e: # pragma: no cover
|
269
269
|
raise MissingDependencyError.create_for_package(
|
270
270
|
dependency_group="gmft", functionality="table extraction", package_name="gmft"
|
271
271
|
) from e
|
@@ -379,7 +379,7 @@ def extract_tables_sync(
|
|
379
379
|
finally:
|
380
380
|
doc.close() # type: ignore[no-untyped-call]
|
381
381
|
|
382
|
-
except ImportError as e:
|
382
|
+
except ImportError as e: # pragma: no cover
|
383
383
|
raise MissingDependencyError.create_for_package(
|
384
384
|
dependency_group="gmft", functionality="table extraction", package_name="gmft"
|
385
385
|
) from e
|
@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
|
|
268
268
|
return [TextContent(type="text", text=content)]
|
269
269
|
|
270
270
|
|
271
|
-
def main() -> None:
|
271
|
+
def main() -> None: # pragma: no cover
|
272
272
|
"""Main entry point for the MCP server."""
|
273
273
|
mcp.run()
|
274
274
|
|
@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
|
|
88
88
|
Returns:
|
89
89
|
List of extraction result objects in the same order as input paths
|
90
90
|
"""
|
91
|
-
return [self.process_file_sync(path, **kwargs) for path in paths]
|
91
|
+
return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
|
92
92
|
|
93
93
|
async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
|
94
94
|
"""Asynchronously process a batch of files and extract their text and metadata.
|
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
|
|
106
106
|
from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
|
107
107
|
|
108
108
|
tasks = [self.process_file(path, **kwargs) for path in paths]
|
109
|
-
return await run_taskgroup(*tasks)
|
109
|
+
return await run_taskgroup(*tasks) # pragma: no cover
|
110
110
|
|
111
111
|
def __hash__(self) -> int:
|
112
112
|
"""Hash function for allowing caching."""
|
113
|
-
return hash(type(self).__name__)
|
113
|
+
return hash(type(self).__name__) # pragma: no cover
|
@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
321
321
|
import torch # noqa: PLC0415
|
322
322
|
|
323
323
|
return bool(torch.cuda.is_available())
|
324
|
-
except ImportError:
|
324
|
+
except ImportError: # pragma: no cover
|
325
325
|
return False
|
326
326
|
|
327
327
|
@classmethod
|
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
340
340
|
|
341
341
|
try:
|
342
342
|
import easyocr # noqa: PLC0415
|
343
|
-
except ImportError as e:
|
343
|
+
except ImportError as e: # pragma: no cover
|
344
344
|
raise MissingDependencyError.create_for_package(
|
345
345
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
346
346
|
) from e
|
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
508
508
|
|
509
509
|
try:
|
510
510
|
import easyocr # noqa: PLC0415
|
511
|
-
except ImportError as e:
|
511
|
+
except ImportError as e: # pragma: no cover
|
512
512
|
raise MissingDependencyError.create_for_package(
|
513
513
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
514
514
|
) from e
|
@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
261
261
|
|
262
262
|
try:
|
263
263
|
from paddleocr import PaddleOCR # noqa: PLC0415
|
264
|
-
except ImportError as e:
|
264
|
+
except ImportError as e: # pragma: no cover
|
265
265
|
raise MissingDependencyError.create_for_package(
|
266
266
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
267
267
|
) from e
|
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
428
428
|
|
429
429
|
try:
|
430
430
|
from paddleocr import PaddleOCR # noqa: PLC0415
|
431
|
-
except ImportError as e:
|
431
|
+
except ImportError as e: # pragma: no cover
|
432
432
|
raise MissingDependencyError.create_for_package(
|
433
433
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
434
434
|
) from e
|
@@ -143,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
|
|
143
143
|
minute = date_str[10:12]
|
144
144
|
second = date_str[12:14]
|
145
145
|
time_part = f"T{hour}:{minute}:{second}"
|
146
|
-
|
146
|
+
if time_part:
|
147
|
+
return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat() # noqa: DTZ007
|
148
|
+
return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat() # noqa: DTZ007
|
147
149
|
return date_str
|
148
150
|
|
149
151
|
|