kreuzberg 3.10.0__tar.gz → 3.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/ci.yaml +97 -46
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/PKG-INFO +7 -5
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/benchmark_baseline.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/end_to_end_benchmark.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/statistical_benchmark.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/contributing.md +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/installation.md +11 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/index.md +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/document-classification.md +9 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_api/main.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_chunker.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_config.py +41 -16
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_document_classification.py +41 -6
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_entity_extraction.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_base.py +1 -2
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_email.py +31 -8
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_image.py +18 -17
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pdf.py +31 -34
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_structured.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_gmft.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_language_detection.py +1 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mcp/server.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_base.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_easyocr.py +3 -3
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_paddleocr.py +2 -2
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_playa.py +3 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_types.py +14 -13
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_device.py +6 -6
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_document_cache.py +1 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/cli.py +6 -6
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/mkdocs.yaml +0 -1
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/pyproject.toml +24 -9
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/api/main_test.py +323 -0
- kreuzberg-3.11.0/tests/cli_command_test.py +523 -0
- kreuzberg-3.11.0/tests/config_test.py +1570 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/conftest.py +6 -0
- kreuzberg-3.11.0/tests/document_classification_test.py +921 -0
- kreuzberg-3.11.0/tests/entity_extraction_test.py +675 -0
- kreuzberg-3.11.0/tests/extraction_test.py +834 -0
- kreuzberg-3.11.0/tests/extractors/email_test.py +1003 -0
- kreuzberg-3.11.0/tests/extractors/image_test.py +768 -0
- kreuzberg-3.11.0/tests/extractors/pandoc_test.py +2123 -0
- kreuzberg-3.11.0/tests/extractors/pdf_test.py +973 -0
- kreuzberg-3.11.0/tests/extractors/presentation_test.py +1005 -0
- kreuzberg-3.11.0/tests/extractors/spreed_sheet_test.py +1237 -0
- kreuzberg-3.11.0/tests/extractors/structured_test.py +302 -0
- kreuzberg-3.11.0/tests/gmft_test.py +720 -0
- kreuzberg-3.11.0/tests/language_detection_test.py +172 -0
- kreuzberg-3.11.0/tests/mcp_server_test.py +883 -0
- kreuzberg-3.11.0/tests/ocr/tesseract_test.py +1141 -0
- kreuzberg-3.11.0/tests/playa_helpers_test.py +549 -0
- kreuzberg-3.11.0/tests/types_test.py +440 -0
- kreuzberg-3.11.0/tests/utils/string_test.py +305 -0
- kreuzberg-3.11.0/tests/utils_errors_test.py +299 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/uv.lock +238 -184
- kreuzberg-3.10.0/docs/changelog.md +0 -49
- kreuzberg-3.10.0/tests/config_test.py +0 -401
- kreuzberg-3.10.0/tests/document_classification_test.py +0 -86
- kreuzberg-3.10.0/tests/entity_extraction_test.py +0 -102
- kreuzberg-3.10.0/tests/extraction_test.py +0 -389
- kreuzberg-3.10.0/tests/extractors/email_comprehensive_test.py +0 -326
- kreuzberg-3.10.0/tests/extractors/email_test.py +0 -31
- kreuzberg-3.10.0/tests/extractors/image_test.py +0 -275
- kreuzberg-3.10.0/tests/extractors/pandoc_test.py +0 -458
- kreuzberg-3.10.0/tests/extractors/pdf_test.py +0 -438
- kreuzberg-3.10.0/tests/extractors/presentation_test.py +0 -410
- kreuzberg-3.10.0/tests/extractors/spreed_sheet_test.py +0 -325
- kreuzberg-3.10.0/tests/extractors/structured_test.py +0 -90
- kreuzberg-3.10.0/tests/gmft_test.py +0 -397
- kreuzberg-3.10.0/tests/language_detection_test.py +0 -237
- kreuzberg-3.10.0/tests/mcp_server_test.py +0 -382
- kreuzberg-3.10.0/tests/ocr/tesseract_test.py +0 -477
- kreuzberg-3.10.0/tests/types_test.py +0 -191
- kreuzberg-3.10.0/tests/utils/string_test.py +0 -85
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.commitlintrc +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.deepsource.toml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.docker/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.dockerignore +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.gitignore +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/LICENSE +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/cli.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
|
|
51
51
|
- name: Execute Pre-Commit
|
52
52
|
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
53
53
|
|
54
|
+
# Coverage job runs first, only on Python 3.13 Ubuntu
|
55
|
+
coverage:
|
56
|
+
needs: validate
|
57
|
+
runs-on: ubuntu-latest
|
58
|
+
timeout-minutes: 20
|
59
|
+
steps:
|
60
|
+
- name: Checkout
|
61
|
+
uses: actions/checkout@v4
|
62
|
+
|
63
|
+
- name: Install uv
|
64
|
+
uses: astral-sh/setup-uv@v6
|
65
|
+
with:
|
66
|
+
enable-cache: true
|
67
|
+
|
68
|
+
- name: Install Python
|
69
|
+
uses: actions/setup-python@v5
|
70
|
+
id: setup-python
|
71
|
+
with:
|
72
|
+
python-version: "3.13"
|
73
|
+
|
74
|
+
- name: Cache Python Dependencies
|
75
|
+
id: python-cache
|
76
|
+
uses: actions/cache@v4
|
77
|
+
with:
|
78
|
+
path: |
|
79
|
+
~/.cache/uv
|
80
|
+
.venv
|
81
|
+
key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
|
82
|
+
restore-keys: |
|
83
|
+
python-dependencies-ubuntu-latest-3.13-
|
84
|
+
|
85
|
+
- name: Install Dependencies
|
86
|
+
uses: nick-fields/retry@v3
|
87
|
+
with:
|
88
|
+
timeout_minutes: 5
|
89
|
+
max_attempts: 3
|
90
|
+
retry_wait_seconds: 30
|
91
|
+
command: |
|
92
|
+
uv sync --all-packages --all-extras --dev
|
93
|
+
shell: bash
|
94
|
+
|
95
|
+
- name: Install System Dependencies
|
96
|
+
uses: nick-fields/retry@v3
|
97
|
+
with:
|
98
|
+
timeout_minutes: 5
|
99
|
+
max_attempts: 3
|
100
|
+
retry_wait_seconds: 30
|
101
|
+
command: |
|
102
|
+
sudo apt-get update
|
103
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
104
|
+
shell: bash
|
105
|
+
|
106
|
+
- name: Run Tests with Coverage
|
107
|
+
uses: nick-fields/retry@v3
|
108
|
+
with:
|
109
|
+
timeout_minutes: 15
|
110
|
+
max_attempts: 3
|
111
|
+
retry_wait_seconds: 10
|
112
|
+
command: |
|
113
|
+
uv run coverage erase
|
114
|
+
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
|
115
|
+
uv run coverage report --precision=2
|
116
|
+
shell: bash
|
117
|
+
|
118
|
+
- name: Upload Coverage to DeepSource
|
119
|
+
if: always() && github.event_name == 'push'
|
120
|
+
env:
|
121
|
+
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
122
|
+
run: |
|
123
|
+
# Install DeepSource CLI
|
124
|
+
curl -fsSL https://deepsource.io/cli | sh
|
125
|
+
# Upload coverage report
|
126
|
+
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
127
|
+
|
128
|
+
- name: Upload Coverage Artifacts
|
129
|
+
if: always()
|
130
|
+
uses: actions/upload-artifact@v4
|
131
|
+
with:
|
132
|
+
name: coverage-report-${{ github.sha }}
|
133
|
+
path: |
|
134
|
+
coverage.lcov
|
135
|
+
.coverage
|
136
|
+
retention-days: 7
|
137
|
+
|
138
|
+
# Full test matrix runs only after coverage succeeds
|
54
139
|
test:
|
140
|
+
needs: coverage
|
141
|
+
runs-on: ${{ matrix.os }}
|
55
142
|
strategy:
|
143
|
+
fail-fast: false
|
56
144
|
matrix:
|
57
|
-
os: [
|
58
|
-
python:
|
59
|
-
|
145
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
146
|
+
python: ["3.10", "3.11", "3.12", "3.13"]
|
147
|
+
exclude:
|
148
|
+
# Skip Python 3.13 on macOS for now due to compatibility issues
|
149
|
+
- os: macos-latest
|
150
|
+
python: "3.13"
|
60
151
|
timeout-minutes: 30
|
61
152
|
steps:
|
62
153
|
- name: Checkout
|
@@ -146,52 +237,12 @@ jobs:
|
|
146
237
|
pandoc --version
|
147
238
|
shell: pwsh
|
148
239
|
|
149
|
-
- name:
|
150
|
-
run: |
|
151
|
-
rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
|
152
|
-
shell: bash
|
153
|
-
|
154
|
-
- name: Run Tests with Coverage
|
155
|
-
run: |
|
156
|
-
uv run coverage erase
|
157
|
-
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
|
158
|
-
|
159
|
-
- name: Upload Coverage Artifacts
|
160
|
-
if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
|
161
|
-
uses: actions/upload-artifact@v4
|
162
|
-
with:
|
163
|
-
name: coverage-report
|
164
|
-
path: coverage.lcov
|
165
|
-
retention-days: 1
|
166
|
-
|
167
|
-
upload-coverage:
|
168
|
-
needs: test
|
169
|
-
runs-on: ubuntu-latest
|
170
|
-
if: github.event_name == 'push' || github.event_name == 'pull_request'
|
171
|
-
steps:
|
172
|
-
- name: Checkout
|
173
|
-
uses: actions/checkout@v4
|
174
|
-
with:
|
175
|
-
ref: ${{ github.event.pull_request.head.sha || github.sha }}
|
176
|
-
|
177
|
-
- name: Download Coverage Artifacts
|
178
|
-
uses: actions/download-artifact@v4
|
179
|
-
with:
|
180
|
-
name: coverage-report
|
181
|
-
path: .
|
182
|
-
|
183
|
-
- name: Install DeepSource CLI
|
240
|
+
- name: Run Tests (without coverage)
|
184
241
|
uses: nick-fields/retry@v3
|
185
242
|
with:
|
186
|
-
timeout_minutes:
|
243
|
+
timeout_minutes: 15
|
187
244
|
max_attempts: 3
|
188
245
|
retry_wait_seconds: 10
|
189
246
|
command: |
|
190
|
-
|
247
|
+
uv run pytest -s -vvv --reruns 2 --reruns-delay 1
|
191
248
|
shell: bash
|
192
|
-
|
193
|
-
- name: Upload Coverage to DeepSource
|
194
|
-
env:
|
195
|
-
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
196
|
-
run: |
|
197
|
-
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.10.0
|
3
|
+
Version: 3.11.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.12.
|
35
|
+
Requires-Dist: mcp>=1.12.3
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.6.4
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
|
45
45
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
46
|
Provides-Extra: all
|
47
47
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
48
49
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
55
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
|
+
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
56
58
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
57
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
58
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
|
|
61
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
62
64
|
Provides-Extra: api
|
63
65
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
64
|
-
Provides-Extra: auto-classify-document-type
|
65
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
66
|
-
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
67
66
|
Provides-Extra: chunking
|
68
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
69
68
|
Provides-Extra: cli
|
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
|
|
72
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
72
|
Provides-Extra: crypto
|
74
73
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
74
|
+
Provides-Extra: document-classification
|
75
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
+
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
75
77
|
Provides-Extra: easyocr
|
76
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
77
79
|
Provides-Extra: entity-extraction
|
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
|
|
108
108
|
return results # type: ignore[return-value]
|
109
109
|
|
110
110
|
|
111
|
-
if __name__ == "__main__":
|
111
|
+
if __name__ == "__main__": # pragma: no cover
|
112
112
|
baseline_results = asyncio.run(run_baseline_benchmark())
|
113
113
|
|
114
114
|
baseline_file = Path("baseline_results.json")
|
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
|
|
195
195
|
}
|
196
196
|
|
197
197
|
|
198
|
-
if __name__ == "__main__":
|
198
|
+
if __name__ == "__main__": # pragma: no cover
|
199
199
|
print("🧪 REPRODUCIBLE CACHE BENCHMARK")
|
200
200
|
print("Testing msgpack implementation with statistical rigor...")
|
201
201
|
print()
|
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
|
|
187
187
|
}
|
188
188
|
|
189
189
|
|
190
|
-
if __name__ == "__main__":
|
190
|
+
if __name__ == "__main__": # pragma: no cover
|
191
191
|
print("🧪 STATISTICAL CACHE BENCHMARK")
|
192
192
|
print("Testing msgpack implementation with proper error analysis...")
|
193
193
|
print()
|
@@ -34,7 +34,7 @@ All commands run through `uv run`:
|
|
34
34
|
# Testing
|
35
35
|
uv run pytest # Run all tests
|
36
36
|
uv run pytest tests/foo_test.py # Run specific test
|
37
|
-
uv run pytest --cov # With coverage (must be ≥
|
37
|
+
uv run pytest --cov # With coverage (must be ≥85%)
|
38
38
|
|
39
39
|
# Code quality
|
40
40
|
uv run ruff format # Format code
|
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
|
|
134
134
|
|
135
135
|
spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
|
136
136
|
|
137
|
+
### Document Classification
|
138
|
+
|
139
|
+
For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
|
140
|
+
|
141
|
+
```shell
|
142
|
+
pip install "kreuzberg[document-classification]"
|
143
|
+
```
|
144
|
+
|
145
|
+
This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
|
146
|
+
|
137
147
|
### All Optional Dependencies
|
138
148
|
|
139
149
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
|
|
145
155
|
This is equivalent to:
|
146
156
|
|
147
157
|
```shell
|
148
|
-
pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
158
|
+
pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
149
159
|
```
|
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
|
|
22
22
|
|
23
23
|
### Engineering Principles
|
24
24
|
|
25
|
-
- **Test Coverage**:
|
25
|
+
- **Test Coverage**: Comprehensive test suites ensuring code reliability
|
26
26
|
- **API Design**: True async/await implementation alongside synchronous APIs
|
27
27
|
- **Error Handling**: Consistent exception hierarchy with detailed context
|
28
28
|
- **Type Safety**: Full type annotations for enhanced developer experience
|
@@ -2,9 +2,17 @@
|
|
2
2
|
|
3
3
|
Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
|
4
4
|
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Document classification requires the `document-classification` extra to be installed:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
pip install "kreuzberg[document-classification]"
|
11
|
+
```
|
12
|
+
|
5
13
|
## Enabling Document Classification
|
6
14
|
|
7
|
-
To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
15
|
+
Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
8
16
|
|
9
17
|
```python
|
10
18
|
from kreuzberg import ExtractionConfig, extract_file
|
@@ -30,7 +30,7 @@ try:
|
|
30
30
|
HTTP_422_UNPROCESSABLE_ENTITY,
|
31
31
|
HTTP_500_INTERNAL_SERVER_ERROR,
|
32
32
|
)
|
33
|
-
except ImportError as e:
|
33
|
+
except ImportError as e: # pragma: no cover
|
34
34
|
raise MissingDependencyError.create_for_package(
|
35
35
|
dependency_group="litestar",
|
36
36
|
functionality="Litestar API and docker container",
|
@@ -43,7 +43,7 @@ def get_chunker(
|
|
43
43
|
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
44
44
|
|
45
45
|
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
|
-
except ImportError as e:
|
46
|
+
except ImportError as e: # pragma: no cover
|
47
47
|
raise MissingDependencyError.create_for_package(
|
48
48
|
dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
|
49
49
|
) from e
|
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
|
|
13
13
|
|
14
14
|
if sys.version_info >= (3, 11):
|
15
15
|
import tomllib
|
16
|
-
else:
|
16
|
+
else: # pragma: no cover
|
17
17
|
import tomli as tomllib # type: ignore[import-not-found]
|
18
18
|
|
19
19
|
from kreuzberg._gmft import GMFTConfig
|
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
50
50
|
# Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
|
51
51
|
if config_path.name == "kreuzberg.toml":
|
52
52
|
return data # type: ignore[no-any-return]
|
53
|
-
|
53
|
+
|
54
|
+
# For other files, check if they have [tool.kreuzberg] section
|
55
|
+
if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
|
56
|
+
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
57
|
+
|
58
|
+
# Otherwise assume root-level configuration
|
59
|
+
return data # type: ignore[no-any-return]
|
54
60
|
|
55
61
|
|
56
62
|
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
@@ -91,19 +97,21 @@ def parse_ocr_backend_config(
|
|
91
97
|
if not isinstance(backend_config, dict):
|
92
98
|
return None
|
93
99
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
100
|
+
match backend:
|
101
|
+
case "tesseract":
|
102
|
+
# Convert psm integer to PSMMode enum if needed
|
103
|
+
processed_config = backend_config.copy()
|
104
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
106
|
+
|
107
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
108
|
+
return TesseractConfig(**processed_config)
|
109
|
+
case "easyocr":
|
110
|
+
return EasyOCRConfig(**backend_config)
|
111
|
+
case "paddleocr":
|
112
|
+
return PaddleOCRConfig(**backend_config)
|
113
|
+
case _:
|
114
|
+
return None
|
107
115
|
|
108
116
|
|
109
117
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
@@ -129,12 +137,25 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
129
137
|
"extract_keywords",
|
130
138
|
"auto_detect_language",
|
131
139
|
"enable_quality_processing",
|
140
|
+
"auto_detect_document_type",
|
141
|
+
"document_type_confidence_threshold",
|
142
|
+
"document_classification_mode",
|
143
|
+
"keyword_count",
|
144
|
+
}
|
145
|
+
extraction_config = extraction_config | {
|
146
|
+
field: config_dict[field] for field in basic_fields if field in config_dict
|
132
147
|
}
|
133
|
-
extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
|
134
148
|
|
135
149
|
# Handle OCR backend configuration
|
136
150
|
ocr_backend = extraction_config.get("ocr_backend")
|
137
151
|
if ocr_backend and ocr_backend != "none":
|
152
|
+
# Validate OCR backend
|
153
|
+
valid_backends = {"tesseract", "easyocr", "paddleocr"}
|
154
|
+
if ocr_backend not in valid_backends:
|
155
|
+
raise ValidationError(
|
156
|
+
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
|
157
|
+
context={"provided": ocr_backend, "valid": sorted(valid_backends)},
|
158
|
+
)
|
138
159
|
ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
|
139
160
|
if ocr_config:
|
140
161
|
extraction_config["ocr_config"] = ocr_config
|
@@ -286,6 +307,10 @@ _CONFIG_FIELDS = [
|
|
286
307
|
"extract_keywords",
|
287
308
|
"auto_detect_language",
|
288
309
|
"enable_quality_processing",
|
310
|
+
"auto_detect_document_type",
|
311
|
+
"document_type_confidence_threshold",
|
312
|
+
"document_classification_mode",
|
313
|
+
"keyword_count",
|
289
314
|
]
|
290
315
|
|
291
316
|
|
@@ -4,13 +4,12 @@ import re
|
|
4
4
|
from typing import TYPE_CHECKING
|
5
5
|
|
6
6
|
from kreuzberg._ocr import get_ocr_backend
|
7
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
|
7
8
|
from kreuzberg.exceptions import MissingDependencyError
|
8
9
|
|
9
10
|
if TYPE_CHECKING:
|
10
11
|
from pathlib import Path
|
11
12
|
|
12
|
-
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
13
|
-
|
14
13
|
|
15
14
|
DOCUMENT_CLASSIFIERS = {
|
16
15
|
"invoice": [
|
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
52
51
|
Raises:
|
53
52
|
MissingDependencyError: If the deep-translator package is not installed
|
54
53
|
"""
|
54
|
+
# Combine content with metadata for classification
|
55
|
+
text_to_classify = result.content
|
56
|
+
if result.metadata:
|
57
|
+
# Add metadata values to the text for classification
|
58
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
59
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
60
|
+
|
55
61
|
try:
|
56
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
57
|
-
except ImportError as e:
|
63
|
+
except ImportError as e: # pragma: no cover
|
58
64
|
raise MissingDependencyError(
|
59
|
-
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[
|
65
|
+
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
|
60
66
|
) from e
|
61
67
|
|
62
|
-
|
68
|
+
try:
|
69
|
+
return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
70
|
+
except Exception: # noqa: BLE001
|
71
|
+
# Fall back to original content in lowercase if translation fails
|
72
|
+
return text_to_classify.lower()
|
63
73
|
|
64
74
|
|
65
75
|
def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
|
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
|
|
73
83
|
A tuple containing the detected document type and the confidence score,
|
74
84
|
or (None, None) if no type is detected with sufficient confidence.
|
75
85
|
"""
|
86
|
+
if not config.auto_detect_document_type:
|
87
|
+
return None, None
|
88
|
+
|
76
89
|
translated_text = _get_translated_text(result)
|
77
90
|
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
|
78
91
|
|
@@ -108,7 +121,8 @@ def classify_document_from_layout(
|
|
108
121
|
A tuple containing the detected document type and the confidence score,
|
109
122
|
or (None, None) if no type is detected with sufficient confidence.
|
110
123
|
"""
|
111
|
-
|
124
|
+
if not config.auto_detect_document_type:
|
125
|
+
return None, None
|
112
126
|
|
113
127
|
if result.layout is None or result.layout.empty:
|
114
128
|
return None, None
|
@@ -117,6 +131,24 @@ def classify_document_from_layout(
|
|
117
131
|
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
118
132
|
return None, None
|
119
133
|
|
134
|
+
# Use layout text for classification, not the content
|
135
|
+
layout_text = " ".join(layout_df["text"].astype(str).tolist())
|
136
|
+
|
137
|
+
# Translate layout text directly for classification
|
138
|
+
text_to_classify = layout_text
|
139
|
+
if result.metadata:
|
140
|
+
# Add metadata values to the text for classification
|
141
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
142
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
143
|
+
|
144
|
+
try:
|
145
|
+
from deep_translator import GoogleTranslator # noqa: PLC0415
|
146
|
+
|
147
|
+
translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
148
|
+
except Exception: # noqa: BLE001
|
149
|
+
# Fall back to original content in lowercase if translation fails
|
150
|
+
translated_text = text_to_classify.lower()
|
151
|
+
|
120
152
|
layout_df["translated_text"] = translated_text
|
121
153
|
|
122
154
|
page_height = layout_df["top"].max() + layout_df["height"].max()
|
@@ -151,6 +183,9 @@ def auto_detect_document_type(
|
|
151
183
|
if config.document_classification_mode == "vision" and file_path:
|
152
184
|
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
153
185
|
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
186
|
+
elif result.layout is not None and not result.layout.empty:
|
187
|
+
# Use layout-based classification if layout data is available
|
188
|
+
result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
|
154
189
|
else:
|
155
190
|
result.document_type, result.document_type_confidence = classify_document(result, config)
|
156
191
|
return result
|
@@ -139,7 +139,7 @@ def extract_entities(
|
|
139
139
|
|
140
140
|
try:
|
141
141
|
import spacy # noqa: F401, PLC0415
|
142
|
-
except ImportError as e:
|
142
|
+
except ImportError as e: # pragma: no cover
|
143
143
|
raise MissingDependencyError.create_for_package(
|
144
144
|
package_name="spacy",
|
145
145
|
dependency_group="entity-extraction",
|
@@ -230,7 +230,7 @@ def extract_keywords(
|
|
230
230
|
return [(kw, float(score)) for kw, score in keywords]
|
231
231
|
except (RuntimeError, OSError, ValueError):
|
232
232
|
return []
|
233
|
-
except ImportError as e:
|
233
|
+
except ImportError as e: # pragma: no cover
|
234
234
|
raise MissingDependencyError.create_for_package(
|
235
235
|
package_name="keybert",
|
236
236
|
dependency_group="entity-extraction",
|
@@ -116,8 +116,7 @@ class Extractor(ABC):
|
|
116
116
|
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
117
|
|
118
118
|
# Add quality metadata
|
119
|
-
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
-
enhanced_metadata["quality_score"] = quality_score
|
119
|
+
enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
|
121
120
|
|
122
121
|
# Return enhanced result
|
123
122
|
return ExtractionResult(
|
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
|
|
19
19
|
# Import optional dependencies at module level with proper error handling
|
20
20
|
try:
|
21
21
|
import mailparse
|
22
|
-
except ImportError:
|
22
|
+
except ImportError: # pragma: no cover
|
23
23
|
mailparse = None
|
24
24
|
|
25
25
|
try:
|
26
26
|
import html2text # type: ignore[import-not-found]
|
27
|
-
except ImportError:
|
27
|
+
except ImportError: # pragma: no cover
|
28
28
|
html2text = None
|
29
29
|
|
30
30
|
# Compile regex pattern once at module level
|
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
|
|
59
59
|
|
60
60
|
to_info = parsed_email.get("to")
|
61
61
|
if to_info:
|
62
|
+
# Store the raw value in metadata (could be string, dict, or list)
|
62
63
|
if isinstance(to_info, list) and to_info:
|
64
|
+
# For metadata, use first recipient's email if it's a list
|
63
65
|
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
66
|
+
metadata["email_to"] = to_email
|
64
67
|
elif isinstance(to_info, dict):
|
65
|
-
|
68
|
+
metadata["email_to"] = to_info.get("email", "")
|
66
69
|
else:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
+
metadata["email_to"] = str(to_info)
|
71
|
+
|
72
|
+
# For display, format all recipients
|
73
|
+
to_formatted = self._format_email_field(to_info)
|
74
|
+
text_parts.append(f"To: {to_formatted}")
|
70
75
|
|
71
76
|
date = parsed_email.get("date")
|
72
77
|
if date:
|
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
|
|
76
81
|
cc = parsed_email.get("cc")
|
77
82
|
if cc:
|
78
83
|
metadata["email_cc"] = cc
|
79
|
-
|
84
|
+
cc_formatted = self._format_email_field(cc)
|
85
|
+
text_parts.append(f"CC: {cc_formatted}")
|
80
86
|
|
81
87
|
bcc = parsed_email.get("bcc")
|
82
88
|
if bcc:
|
83
89
|
metadata["email_bcc"] = bcc
|
84
|
-
|
90
|
+
bcc_formatted = self._format_email_field(bcc)
|
91
|
+
text_parts.append(f"BCC: {bcc_formatted}")
|
92
|
+
|
93
|
+
def _format_email_field(self, field: Any) -> str:
|
94
|
+
"""Format email field (to, cc, bcc) for display."""
|
95
|
+
if isinstance(field, list):
|
96
|
+
emails = []
|
97
|
+
for item in field:
|
98
|
+
if isinstance(item, dict):
|
99
|
+
email = item.get("email", "")
|
100
|
+
if email:
|
101
|
+
emails.append(email)
|
102
|
+
else:
|
103
|
+
emails.append(str(item))
|
104
|
+
return ", ".join(emails)
|
105
|
+
if isinstance(field, dict):
|
106
|
+
return str(field.get("email", ""))
|
107
|
+
return str(field)
|
85
108
|
|
86
109
|
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
87
110
|
"""Extract and process email body content."""
|