kreuzberg 3.10.1__tar.gz → 3.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/PKG-INFO +7 -5
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/contributing.md +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/installation.md +11 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/index.md +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/document-classification.md +9 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_config.py +18 -14
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_document_classification.py +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_base.py +1 -2
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_image.py +18 -17
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pdf.py +30 -33
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mcp/server.py +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_types.py +11 -10
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/mkdocs.yaml +0 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/pyproject.toml +8 -8
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/document_classification_test.py +49 -14
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/uv.lock +85 -62
- kreuzberg-3.10.1/docs/changelog.md +0 -49
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.commitlintrc +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.deepsource.toml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.docker/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.dockerignore +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.gitignore +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/LICENSE +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/cli.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/api/main_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/config_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/conftest.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extraction_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/gmft_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/types_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils_errors_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.10.1
|
3
|
+
Version: 3.11.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.12.
|
35
|
+
Requires-Dist: mcp>=1.12.3
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.6.4
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
|
45
45
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
46
|
Provides-Extra: all
|
47
47
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
48
49
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
55
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
|
+
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
56
58
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
57
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
58
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
|
|
61
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
62
64
|
Provides-Extra: api
|
63
65
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
64
|
-
Provides-Extra: auto-classify-document-type
|
65
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
66
|
-
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
67
66
|
Provides-Extra: chunking
|
68
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
69
68
|
Provides-Extra: cli
|
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
|
|
72
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
72
|
Provides-Extra: crypto
|
74
73
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
74
|
+
Provides-Extra: document-classification
|
75
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
+
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
75
77
|
Provides-Extra: easyocr
|
76
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
77
79
|
Provides-Extra: entity-extraction
|
@@ -34,7 +34,7 @@ All commands run through `uv run`:
|
|
34
34
|
# Testing
|
35
35
|
uv run pytest # Run all tests
|
36
36
|
uv run pytest tests/foo_test.py # Run specific test
|
37
|
-
uv run pytest --cov # With coverage (must be ≥
|
37
|
+
uv run pytest --cov # With coverage (must be ≥85%)
|
38
38
|
|
39
39
|
# Code quality
|
40
40
|
uv run ruff format # Format code
|
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
|
|
134
134
|
|
135
135
|
spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
|
136
136
|
|
137
|
+
### Document Classification
|
138
|
+
|
139
|
+
For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
|
140
|
+
|
141
|
+
```shell
|
142
|
+
pip install "kreuzberg[document-classification]"
|
143
|
+
```
|
144
|
+
|
145
|
+
This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
|
146
|
+
|
137
147
|
### All Optional Dependencies
|
138
148
|
|
139
149
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
|
|
145
155
|
This is equivalent to:
|
146
156
|
|
147
157
|
```shell
|
148
|
-
pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
158
|
+
pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
149
159
|
```
|
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
|
|
22
22
|
|
23
23
|
### Engineering Principles
|
24
24
|
|
25
|
-
- **Test Coverage**:
|
25
|
+
- **Test Coverage**: Comprehensive test suites ensuring code reliability
|
26
26
|
- **API Design**: True async/await implementation alongside synchronous APIs
|
27
27
|
- **Error Handling**: Consistent exception hierarchy with detailed context
|
28
28
|
- **Type Safety**: Full type annotations for enhanced developer experience
|
@@ -2,9 +2,17 @@
|
|
2
2
|
|
3
3
|
Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
|
4
4
|
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Document classification requires the `document-classification` extra to be installed:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
pip install "kreuzberg[document-classification]"
|
11
|
+
```
|
12
|
+
|
5
13
|
## Enabling Document Classification
|
6
14
|
|
7
|
-
To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
15
|
+
Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
8
16
|
|
9
17
|
```python
|
10
18
|
from kreuzberg import ExtractionConfig, extract_file
|
@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
|
|
97
97
|
if not isinstance(backend_config, dict):
|
98
98
|
return None
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
100
|
+
match backend:
|
101
|
+
case "tesseract":
|
102
|
+
# Convert psm integer to PSMMode enum if needed
|
103
|
+
processed_config = backend_config.copy()
|
104
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
106
|
+
|
107
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
108
|
+
return TesseractConfig(**processed_config)
|
109
|
+
case "easyocr":
|
110
|
+
return EasyOCRConfig(**backend_config)
|
111
|
+
case "paddleocr":
|
112
|
+
return PaddleOCRConfig(**backend_config)
|
113
|
+
case _:
|
114
|
+
return None
|
113
115
|
|
114
116
|
|
115
117
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
140
142
|
"document_classification_mode",
|
141
143
|
"keyword_count",
|
142
144
|
}
|
143
|
-
extraction_config
|
145
|
+
extraction_config = extraction_config | {
|
146
|
+
field: config_dict[field] for field in basic_fields if field in config_dict
|
147
|
+
}
|
144
148
|
|
145
149
|
# Handle OCR backend configuration
|
146
150
|
ocr_backend = extraction_config.get("ocr_backend")
|
@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
62
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
63
63
|
except ImportError as e: # pragma: no cover
|
64
64
|
raise MissingDependencyError(
|
65
|
-
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[
|
65
|
+
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
|
66
66
|
) from e
|
67
67
|
|
68
68
|
try:
|
@@ -116,8 +116,7 @@ class Extractor(ABC):
|
|
116
116
|
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
117
|
|
118
118
|
# Add quality metadata
|
119
|
-
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
-
enhanced_metadata["quality_score"] = quality_score
|
119
|
+
enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
|
121
120
|
|
122
121
|
# Return enhanced result
|
123
122
|
return ExtractionResult(
|
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
|
|
85
85
|
|
86
86
|
backend = get_ocr_backend(self.config.ocr_backend)
|
87
87
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
88
|
+
match self.config.ocr_backend:
|
89
|
+
case "tesseract":
|
90
|
+
config = (
|
91
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
92
|
+
)
|
93
|
+
result = backend.process_file_sync(path, **asdict(config))
|
94
|
+
case "paddleocr":
|
95
|
+
paddle_config = (
|
96
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
97
|
+
)
|
98
|
+
result = backend.process_file_sync(path, **asdict(paddle_config))
|
99
|
+
case "easyocr":
|
100
|
+
easy_config = (
|
101
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
102
|
+
)
|
103
|
+
result = backend.process_file_sync(path, **asdict(easy_config))
|
104
|
+
case _:
|
105
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
105
106
|
return self._apply_quality_processing(result)
|
106
107
|
|
107
108
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
|
|
88
88
|
# Enhance metadata with table information
|
89
89
|
if result.tables:
|
90
90
|
table_summary = generate_table_summary(result.tables)
|
91
|
-
result.metadata.
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
}
|
98
|
-
)
|
91
|
+
result.metadata = result.metadata | {
|
92
|
+
"table_count": table_summary["table_count"],
|
93
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
94
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
95
|
+
f"{table_summary['total_rows']} total rows",
|
96
|
+
}
|
99
97
|
|
100
98
|
return self._apply_quality_processing(result)
|
101
99
|
|
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
|
|
153
151
|
# Enhance metadata with table information
|
154
152
|
if tables:
|
155
153
|
table_summary = generate_table_summary(tables)
|
156
|
-
result.metadata.
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
}
|
163
|
-
)
|
154
|
+
result.metadata = result.metadata | {
|
155
|
+
"table_count": table_summary["table_count"],
|
156
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
157
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
158
|
+
f"{table_summary['total_rows']} total rows",
|
159
|
+
}
|
164
160
|
|
165
161
|
# Apply quality processing
|
166
162
|
return self._apply_quality_processing(result)
|
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
|
|
386
382
|
backend = get_ocr_backend(self.config.ocr_backend)
|
387
383
|
paths = [Path(p) for p in image_paths]
|
388
384
|
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
385
|
+
match self.config.ocr_backend:
|
386
|
+
case "tesseract":
|
387
|
+
config = (
|
388
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
389
|
+
)
|
390
|
+
results = backend.process_batch_sync(paths, **asdict(config))
|
391
|
+
case "paddleocr":
|
392
|
+
paddle_config = (
|
393
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
394
|
+
)
|
395
|
+
results = backend.process_batch_sync(paths, **asdict(paddle_config))
|
396
|
+
case "easyocr":
|
397
|
+
easy_config = (
|
398
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
399
|
+
)
|
400
|
+
results = backend.process_batch_sync(paths, **asdict(easy_config))
|
401
|
+
case _:
|
402
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
406
403
|
|
407
404
|
# Use list comprehension and join for efficient string building
|
408
405
|
return "\n\n".join(result.content for result in results)
|
@@ -349,7 +349,7 @@ class ExtractionConfig:
|
|
349
349
|
"""Configuration for language detection. If None, uses default settings."""
|
350
350
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
351
351
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
352
|
-
auto_detect_document_type: bool = True
|
352
|
+
auto_detect_document_type: bool = False
|
353
353
|
"""Whether to automatically detect the document type."""
|
354
354
|
document_type_confidence_threshold: float = 0.5
|
355
355
|
"""Confidence threshold for document type detection."""
|
@@ -398,15 +398,16 @@ class ExtractionConfig:
|
|
398
398
|
return asdict(self.ocr_config)
|
399
399
|
|
400
400
|
# Lazy load and cache default configs instead of creating new instances
|
401
|
-
|
402
|
-
|
401
|
+
match self.ocr_backend:
|
402
|
+
case "tesseract":
|
403
|
+
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
403
404
|
|
404
|
-
|
405
|
-
|
406
|
-
|
405
|
+
return asdict(TesseractConfig())
|
406
|
+
case "easyocr":
|
407
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
407
408
|
|
408
|
-
|
409
|
-
|
410
|
-
|
409
|
+
return asdict(EasyOCRConfig())
|
410
|
+
case _: # paddleocr or any other backend
|
411
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
411
412
|
|
412
|
-
|
413
|
+
return asdict(PaddleOCRConfig())
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.10.1"
|
8
|
+
version = "3.11.0"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -61,7 +61,7 @@ dependencies = [
|
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
63
|
"html-to-markdown[lxml]>=1.9.0",
|
64
|
-
"mcp>=1.12.
|
64
|
+
"mcp>=1.12.3",
|
65
65
|
"msgspec>=0.18.0",
|
66
66
|
"playa-pdf>=0.6.4", # pinned due to breaking changes in 0.5.0
|
67
67
|
"psutil>=7.0.0",
|
@@ -76,15 +76,11 @@ optional-dependencies.additional-extensions = [
|
|
76
76
|
"tomli>=2.0.0; python_version<'3.11'",
|
77
77
|
]
|
78
78
|
optional-dependencies.all = [
|
79
|
-
"kreuzberg[additional-extensions,api,chunking,cli,crypto,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
79
|
+
"kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
80
80
|
]
|
81
81
|
optional-dependencies.api = [
|
82
82
|
"litestar[standard,structlog,opentelemetry]>=2.16.0",
|
83
83
|
]
|
84
|
-
optional-dependencies.auto-classify-document-type = [
|
85
|
-
"deep-translator>=1.11.4",
|
86
|
-
"pandas>=2.3.1",
|
87
|
-
]
|
88
84
|
optional-dependencies.chunking = [ "semantic-text-splitter>=0.27.0" ]
|
89
85
|
optional-dependencies.cli = [
|
90
86
|
"click>=8.2.1",
|
@@ -92,6 +88,10 @@ optional-dependencies.cli = [
|
|
92
88
|
"tomli>=2.0.0; python_version<'3.11'",
|
93
89
|
]
|
94
90
|
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.6.4" ]
|
91
|
+
optional-dependencies.document-classification = [
|
92
|
+
"deep-translator>=1.11.4",
|
93
|
+
"pandas>=2.3.1",
|
94
|
+
]
|
95
95
|
optional-dependencies.easyocr = [ "easyocr>=1.7.2" ]
|
96
96
|
optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
|
97
97
|
optional-dependencies.gmft = [ "gmft>=0.4.2" ]
|
@@ -256,7 +256,7 @@ exclude_lines = [
|
|
256
256
|
"class .*\\bProtocol\\):",
|
257
257
|
"@(abc\\.)?abstractmethod",
|
258
258
|
]
|
259
|
-
fail_under =
|
259
|
+
fail_under = 85
|
260
260
|
|
261
261
|
[tool.mypy]
|
262
262
|
packages = [ "kreuzberg", "tests", "benchmarks.src.kreuzberg_benchmarks" ]
|
@@ -2,8 +2,10 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
+
import builtins
|
6
|
+
import sys
|
5
7
|
from pathlib import Path
|
6
|
-
from typing import TYPE_CHECKING
|
8
|
+
from typing import TYPE_CHECKING, Any
|
7
9
|
|
8
10
|
import pandas as pd
|
9
11
|
import pytest
|
@@ -15,6 +17,7 @@ from kreuzberg._document_classification import (
|
|
15
17
|
classify_document_from_layout,
|
16
18
|
)
|
17
19
|
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
20
|
+
from kreuzberg.exceptions import MissingDependencyError
|
18
21
|
|
19
22
|
if TYPE_CHECKING:
|
20
23
|
from pytest_mock import MockerFixture
|
@@ -112,7 +115,7 @@ def test_classify_document_with_metadata() -> None:
|
|
112
115
|
mime_type="text/plain",
|
113
116
|
metadata={"title": "Invoice #12345", "subject": "Payment Due"},
|
114
117
|
)
|
115
|
-
config = ExtractionConfig()
|
118
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
116
119
|
|
117
120
|
doc_type, confidence = classify_document(result, config)
|
118
121
|
|
@@ -142,7 +145,7 @@ def test_classify_document_empty_content() -> None:
|
|
142
145
|
mime_type="text/plain",
|
143
146
|
metadata={},
|
144
147
|
)
|
145
|
-
config = ExtractionConfig()
|
148
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
146
149
|
|
147
150
|
doc_type, confidence = classify_document(result, config)
|
148
151
|
|
@@ -158,7 +161,7 @@ def test_classify_document_with_exclusions() -> None:
|
|
158
161
|
mime_type="text/plain",
|
159
162
|
metadata={},
|
160
163
|
)
|
161
|
-
config = ExtractionConfig()
|
164
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
162
165
|
|
163
166
|
doc_type, confidence = classify_document(result, config)
|
164
167
|
|
@@ -184,7 +187,7 @@ def test_classify_document_from_layout_basic() -> None:
|
|
184
187
|
metadata={},
|
185
188
|
layout=layout_df,
|
186
189
|
)
|
187
|
-
config = ExtractionConfig()
|
190
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
188
191
|
|
189
192
|
doc_type, confidence = classify_document_from_layout(result, config)
|
190
193
|
|
@@ -200,7 +203,7 @@ def test_classify_document_from_layout_no_layout() -> None:
|
|
200
203
|
mime_type="text/plain",
|
201
204
|
metadata={},
|
202
205
|
)
|
203
|
-
config = ExtractionConfig()
|
206
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
204
207
|
|
205
208
|
doc_type, confidence = classify_document_from_layout(result, config)
|
206
209
|
|
@@ -218,7 +221,7 @@ def test_classify_document_from_layout_empty_layout() -> None:
|
|
218
221
|
metadata={},
|
219
222
|
layout=layout_df,
|
220
223
|
)
|
221
|
-
config = ExtractionConfig()
|
224
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
222
225
|
|
223
226
|
doc_type, confidence = classify_document_from_layout(result, config)
|
224
227
|
|
@@ -236,7 +239,7 @@ def test_classify_document_from_layout_missing_columns() -> None:
|
|
236
239
|
metadata={},
|
237
240
|
layout=layout_df,
|
238
241
|
)
|
239
|
-
config = ExtractionConfig()
|
242
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
240
243
|
|
241
244
|
doc_type, confidence = classify_document_from_layout(result, config)
|
242
245
|
|
@@ -260,7 +263,7 @@ def test_classify_document_from_layout_no_pattern_matches() -> None:
|
|
260
263
|
metadata={},
|
261
264
|
layout=layout_df,
|
262
265
|
)
|
263
|
-
config = ExtractionConfig()
|
266
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
264
267
|
|
265
268
|
doc_type, confidence = classify_document_from_layout(result, config)
|
266
269
|
|
@@ -285,7 +288,7 @@ def test_classify_document_from_layout_header_patterns() -> None:
|
|
285
288
|
metadata={},
|
286
289
|
layout=layout_df,
|
287
290
|
)
|
288
|
-
config = ExtractionConfig()
|
291
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
289
292
|
|
290
293
|
doc_type, confidence = classify_document_from_layout(result, config)
|
291
294
|
|
@@ -312,7 +315,7 @@ def test_classify_document_from_layout_position_scoring() -> None:
|
|
312
315
|
metadata={},
|
313
316
|
layout=layout_df,
|
314
317
|
)
|
315
|
-
config = ExtractionConfig()
|
318
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
316
319
|
|
317
320
|
doc_type, confidence = classify_document_from_layout(result, config)
|
318
321
|
|
@@ -327,7 +330,7 @@ def test_auto_detect_document_type_from_content() -> None:
|
|
327
330
|
mime_type="text/plain",
|
328
331
|
metadata={},
|
329
332
|
)
|
330
|
-
config = ExtractionConfig()
|
333
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
331
334
|
|
332
335
|
detection_result = auto_detect_document_type(result, config)
|
333
336
|
|
@@ -352,7 +355,7 @@ def test_auto_detect_document_type_from_layout() -> None:
|
|
352
355
|
metadata={},
|
353
356
|
layout=layout_df,
|
354
357
|
)
|
355
|
-
config = ExtractionConfig()
|
358
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
356
359
|
|
357
360
|
detection_result = auto_detect_document_type(result, config)
|
358
361
|
|
@@ -382,7 +385,7 @@ def test_auto_detect_document_type_no_matches() -> None:
|
|
382
385
|
mime_type="text/plain",
|
383
386
|
metadata={},
|
384
387
|
)
|
385
|
-
config = ExtractionConfig()
|
388
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
386
389
|
|
387
390
|
detection_result = auto_detect_document_type(result, config)
|
388
391
|
|
@@ -884,3 +887,35 @@ def test_classify_document_confidence_calculation(mocker: MockerFixture) -> None
|
|
884
887
|
|
885
888
|
assert doc_type == "invoice"
|
886
889
|
assert confidence == 1.0 # All 3 matches are for invoice, so 3/3 = 1.0
|
890
|
+
|
891
|
+
|
892
|
+
def test_missing_deep_translator_import_error(mocker: MockerFixture) -> None:
|
893
|
+
"""Test that MissingDependencyError is raised when deep-translator is not installed."""
|
894
|
+
# Temporarily remove deep_translator from sys.modules if it exists
|
895
|
+
original_module = sys.modules.pop("deep_translator", None)
|
896
|
+
|
897
|
+
try:
|
898
|
+
# Mock the import to raise ImportError when importing deep_translator
|
899
|
+
def mock_import(name: str, *args: Any, **kwargs: Any) -> Any:
|
900
|
+
if name == "deep_translator":
|
901
|
+
raise ImportError("No module named 'deep_translator'")
|
902
|
+
return original_import(name, *args, **kwargs)
|
903
|
+
|
904
|
+
original_import = builtins.__import__
|
905
|
+
mocker.patch("builtins.__import__", side_effect=mock_import)
|
906
|
+
|
907
|
+
# Import _get_translated_text after setting up the mock
|
908
|
+
from kreuzberg._document_classification import _get_translated_text
|
909
|
+
|
910
|
+
result = ExtractionResult(content="Test content", mime_type="text/plain", metadata={})
|
911
|
+
|
912
|
+
# Should raise MissingDependencyError when trying to import deep_translator
|
913
|
+
with pytest.raises(MissingDependencyError) as exc_info:
|
914
|
+
_get_translated_text(result)
|
915
|
+
|
916
|
+
assert "deep-translator" in str(exc_info.value)
|
917
|
+
assert "pip install 'kreuzberg[document-classification]'" in str(exc_info.value)
|
918
|
+
finally:
|
919
|
+
# Restore original module if it existed
|
920
|
+
if original_module is not None:
|
921
|
+
sys.modules["deep_translator"] = original_module
|