kreuzberg 3.10.1__tar.gz → 3.11.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/ci.yaml +3 -3
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/docs.yml +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/pr-title.yaml +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/publish-docker.yml +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/release.yaml +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.pre-commit-config.yaml +9 -7
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/PKG-INFO +13 -11
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/contributing.md +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/examples/extraction-examples.md +4 -4
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/installation.md +11 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/index.md +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/document-classification.md +9 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/extraction-configuration.md +3 -3
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_config.py +18 -14
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_document_classification.py +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_base.py +1 -2
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_image.py +18 -17
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pdf.py +30 -33
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mcp/server.py +1 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_easyocr.py +8 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_paddleocr.py +2 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_types.py +11 -10
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/mkdocs.yaml +0 -1
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/pyproject.toml +14 -14
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/document_classification_test.py +49 -14
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/uv.lock +1037 -980
- kreuzberg-3.10.1/docs/changelog.md +0 -49
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.commitlintrc +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.deepsource.toml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.docker/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.dockerignore +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.gitignore +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/LICENSE +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/cli.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/config_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/conftest.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/gmft_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/types_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils_errors_test.py +0 -0
@@ -15,7 +15,7 @@ jobs:
|
|
15
15
|
timeout-minutes: 10
|
16
16
|
steps:
|
17
17
|
- name: Checkout
|
18
|
-
uses: actions/checkout@
|
18
|
+
uses: actions/checkout@v5
|
19
19
|
|
20
20
|
- name: Install uv
|
21
21
|
uses: astral-sh/setup-uv@v6
|
@@ -58,7 +58,7 @@ jobs:
|
|
58
58
|
timeout-minutes: 20
|
59
59
|
steps:
|
60
60
|
- name: Checkout
|
61
|
-
uses: actions/checkout@
|
61
|
+
uses: actions/checkout@v5
|
62
62
|
|
63
63
|
- name: Install uv
|
64
64
|
uses: astral-sh/setup-uv@v6
|
@@ -151,7 +151,7 @@ jobs:
|
|
151
151
|
timeout-minutes: 30
|
152
152
|
steps:
|
153
153
|
- name: Checkout
|
154
|
-
uses: actions/checkout@
|
154
|
+
uses: actions/checkout@v5
|
155
155
|
|
156
156
|
- name: Install uv
|
157
157
|
uses: astral-sh/setup-uv@v6
|
@@ -5,13 +5,15 @@ repos:
|
|
5
5
|
- id: commitlint
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
-
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
# Temporarily disabled - ai-rulez Go build failing in CI
|
9
|
+
# TODO: Re-enable once ai-rulez v1.4.4+ Python migration is stable
|
10
|
+
# - repo: https://github.com/Goldziher/ai-rulez
|
11
|
+
# rev: v1.4.3
|
12
|
+
# hooks:
|
13
|
+
# - id: ai-rulez-validate
|
14
|
+
# - id: ai-rulez-generate
|
13
15
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
14
|
-
rev:
|
16
|
+
rev: v6.0.0
|
15
17
|
hooks:
|
16
18
|
- id: name-tests-test
|
17
19
|
args:
|
@@ -53,7 +55,7 @@ repos:
|
|
53
55
|
hooks:
|
54
56
|
- id: pyproject-fmt
|
55
57
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
58
|
+
rev: v0.12.8
|
57
59
|
hooks:
|
58
60
|
- id: ruff
|
59
61
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.11.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.12.
|
35
|
+
Requires-Dist: mcp>=1.12.4
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: playa-pdf>=0.
|
37
|
+
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
39
39
|
Requires-Dist: pypdfium2==4.30.0
|
40
40
|
Requires-Dist: python-calamine>=0.3.2
|
@@ -45,25 +45,24 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
|
45
45
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
46
|
Provides-Extra: all
|
47
47
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
48
49
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
51
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
52
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
53
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
53
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
55
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
-
Requires-Dist:
|
57
|
+
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
58
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
57
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
58
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
59
61
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
60
62
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
61
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
62
64
|
Provides-Extra: api
|
63
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
64
|
-
Provides-Extra: auto-classify-document-type
|
65
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
66
|
-
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
65
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
67
66
|
Provides-Extra: chunking
|
68
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
69
68
|
Provides-Extra: cli
|
@@ -71,7 +70,10 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
|
|
71
70
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
72
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
72
|
Provides-Extra: crypto
|
74
|
-
Requires-Dist: playa-pdf[crypto]>=0.
|
73
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
74
|
+
Provides-Extra: document-classification
|
75
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
+
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
75
77
|
Provides-Extra: easyocr
|
76
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
77
79
|
Provides-Extra: entity-extraction
|
@@ -34,7 +34,7 @@ All commands run through `uv run`:
|
|
34
34
|
# Testing
|
35
35
|
uv run pytest # Run all tests
|
36
36
|
uv run pytest tests/foo_test.py # Run specific test
|
37
|
-
uv run pytest --cov # With coverage (must be ≥
|
37
|
+
uv run pytest --cov # With coverage (must be ≥85%)
|
38
38
|
|
39
39
|
# Code quality
|
40
40
|
uv run ruff format # Format code
|
@@ -132,15 +132,15 @@ async def extract_tables_from_pdf():
|
|
132
132
|
# Process extracted tables
|
133
133
|
print(f"Found {len(result.tables)} tables")
|
134
134
|
for i, table in enumerate(result.tables):
|
135
|
-
print(f"Table {i+1} on page {table
|
136
|
-
print(table
|
135
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
136
|
+
print(table["text"]) # Markdown formatted table
|
137
137
|
|
138
138
|
# Work with the pandas DataFrame
|
139
|
-
df = table
|
139
|
+
df = table["df"]
|
140
140
|
print(f"Table shape: {df.shape}")
|
141
141
|
|
142
142
|
# The cropped table image is also available
|
143
|
-
# table
|
143
|
+
# table['cropped_image'].save(f"table_{i+1}.png")
|
144
144
|
|
145
145
|
# With custom GMFT configuration
|
146
146
|
custom_config = ExtractionConfig(
|
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
|
|
134
134
|
|
135
135
|
spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
|
136
136
|
|
137
|
+
### Document Classification
|
138
|
+
|
139
|
+
For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
|
140
|
+
|
141
|
+
```shell
|
142
|
+
pip install "kreuzberg[document-classification]"
|
143
|
+
```
|
144
|
+
|
145
|
+
This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
|
146
|
+
|
137
147
|
### All Optional Dependencies
|
138
148
|
|
139
149
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
|
|
145
155
|
This is equivalent to:
|
146
156
|
|
147
157
|
```shell
|
148
|
-
pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
158
|
+
pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
149
159
|
```
|
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
|
|
22
22
|
|
23
23
|
### Engineering Principles
|
24
24
|
|
25
|
-
- **Test Coverage**:
|
25
|
+
- **Test Coverage**: Comprehensive test suites ensuring code reliability
|
26
26
|
- **API Design**: True async/await implementation alongside synchronous APIs
|
27
27
|
- **Error Handling**: Consistent exception hierarchy with detailed context
|
28
28
|
- **Type Safety**: Full type annotations for enhanced developer experience
|
@@ -2,9 +2,17 @@
|
|
2
2
|
|
3
3
|
Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
|
4
4
|
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Document classification requires the `document-classification` extra to be installed:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
pip install "kreuzberg[document-classification]"
|
11
|
+
```
|
12
|
+
|
5
13
|
## Enabling Document Classification
|
6
14
|
|
7
|
-
To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
15
|
+
Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
8
16
|
|
9
17
|
```python
|
10
18
|
from kreuzberg import ExtractionConfig, extract_file
|
@@ -237,10 +237,10 @@ result = await extract_file("document_with_tables.pdf", config=config)
|
|
237
237
|
|
238
238
|
# Access extracted tables
|
239
239
|
for i, table in enumerate(result.tables):
|
240
|
-
print(f"Table {i+1} on page {table
|
241
|
-
print(table
|
240
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
241
|
+
print(table["text"]) # Markdown formatted table text
|
242
242
|
# You can also access the pandas DataFrame directly
|
243
|
-
df = table
|
243
|
+
df = table["df"]
|
244
244
|
print(df.shape) # (rows, columns)
|
245
245
|
```
|
246
246
|
|
@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
|
|
97
97
|
if not isinstance(backend_config, dict):
|
98
98
|
return None
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
100
|
+
match backend:
|
101
|
+
case "tesseract":
|
102
|
+
# Convert psm integer to PSMMode enum if needed
|
103
|
+
processed_config = backend_config.copy()
|
104
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
106
|
+
|
107
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
108
|
+
return TesseractConfig(**processed_config)
|
109
|
+
case "easyocr":
|
110
|
+
return EasyOCRConfig(**backend_config)
|
111
|
+
case "paddleocr":
|
112
|
+
return PaddleOCRConfig(**backend_config)
|
113
|
+
case _:
|
114
|
+
return None
|
113
115
|
|
114
116
|
|
115
117
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
140
142
|
"document_classification_mode",
|
141
143
|
"keyword_count",
|
142
144
|
}
|
143
|
-
extraction_config
|
145
|
+
extraction_config = extraction_config | {
|
146
|
+
field: config_dict[field] for field in basic_fields if field in config_dict
|
147
|
+
}
|
144
148
|
|
145
149
|
# Handle OCR backend configuration
|
146
150
|
ocr_backend = extraction_config.get("ocr_backend")
|
@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
62
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
63
63
|
except ImportError as e: # pragma: no cover
|
64
64
|
raise MissingDependencyError(
|
65
|
-
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[
|
65
|
+
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
|
66
66
|
) from e
|
67
67
|
|
68
68
|
try:
|
@@ -116,8 +116,7 @@ class Extractor(ABC):
|
|
116
116
|
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
117
|
|
118
118
|
# Add quality metadata
|
119
|
-
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
-
enhanced_metadata["quality_score"] = quality_score
|
119
|
+
enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
|
121
120
|
|
122
121
|
# Return enhanced result
|
123
122
|
return ExtractionResult(
|
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
|
|
85
85
|
|
86
86
|
backend = get_ocr_backend(self.config.ocr_backend)
|
87
87
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
88
|
+
match self.config.ocr_backend:
|
89
|
+
case "tesseract":
|
90
|
+
config = (
|
91
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
92
|
+
)
|
93
|
+
result = backend.process_file_sync(path, **asdict(config))
|
94
|
+
case "paddleocr":
|
95
|
+
paddle_config = (
|
96
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
97
|
+
)
|
98
|
+
result = backend.process_file_sync(path, **asdict(paddle_config))
|
99
|
+
case "easyocr":
|
100
|
+
easy_config = (
|
101
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
102
|
+
)
|
103
|
+
result = backend.process_file_sync(path, **asdict(easy_config))
|
104
|
+
case _:
|
105
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
105
106
|
return self._apply_quality_processing(result)
|
106
107
|
|
107
108
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
|
|
88
88
|
# Enhance metadata with table information
|
89
89
|
if result.tables:
|
90
90
|
table_summary = generate_table_summary(result.tables)
|
91
|
-
result.metadata.
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
}
|
98
|
-
)
|
91
|
+
result.metadata = result.metadata | {
|
92
|
+
"table_count": table_summary["table_count"],
|
93
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
94
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
95
|
+
f"{table_summary['total_rows']} total rows",
|
96
|
+
}
|
99
97
|
|
100
98
|
return self._apply_quality_processing(result)
|
101
99
|
|
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
|
|
153
151
|
# Enhance metadata with table information
|
154
152
|
if tables:
|
155
153
|
table_summary = generate_table_summary(tables)
|
156
|
-
result.metadata.
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
}
|
163
|
-
)
|
154
|
+
result.metadata = result.metadata | {
|
155
|
+
"table_count": table_summary["table_count"],
|
156
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
157
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
158
|
+
f"{table_summary['total_rows']} total rows",
|
159
|
+
}
|
164
160
|
|
165
161
|
# Apply quality processing
|
166
162
|
return self._apply_quality_processing(result)
|
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
|
|
386
382
|
backend = get_ocr_backend(self.config.ocr_backend)
|
387
383
|
paths = [Path(p) for p in image_paths]
|
388
384
|
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
385
|
+
match self.config.ocr_backend:
|
386
|
+
case "tesseract":
|
387
|
+
config = (
|
388
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
389
|
+
)
|
390
|
+
results = backend.process_batch_sync(paths, **asdict(config))
|
391
|
+
case "paddleocr":
|
392
|
+
paddle_config = (
|
393
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
394
|
+
)
|
395
|
+
results = backend.process_batch_sync(paths, **asdict(paddle_config))
|
396
|
+
case "easyocr":
|
397
|
+
easy_config = (
|
398
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
399
|
+
)
|
400
|
+
results = backend.process_batch_sync(paths, **asdict(easy_config))
|
401
|
+
case _:
|
402
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
406
403
|
|
407
404
|
# Use list comprehension and join for efficient string building
|
408
405
|
return "\n\n".join(result.content for result in results)
|
@@ -4,7 +4,6 @@ import warnings
|
|
4
4
|
from dataclasses import dataclass
|
5
5
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
6
6
|
|
7
|
-
import numpy as np
|
8
7
|
from PIL import Image
|
9
8
|
|
10
9
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
188
187
|
|
189
188
|
kwargs.pop("language", None)
|
190
189
|
kwargs.pop("use_gpu", None)
|
190
|
+
kwargs.pop("device", None)
|
191
|
+
kwargs.pop("gpu_memory_limit", None)
|
192
|
+
kwargs.pop("fallback_to_cpu", None)
|
191
193
|
|
192
194
|
try:
|
193
195
|
result = await run_sync(
|
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
455
457
|
Raises:
|
456
458
|
OCRError: If OCR processing fails.
|
457
459
|
"""
|
460
|
+
import numpy as np # noqa: PLC0415
|
461
|
+
|
458
462
|
self._init_easyocr_sync(**kwargs)
|
459
463
|
|
460
464
|
beam_width = kwargs.pop("beam_width")
|
461
465
|
kwargs.pop("language", None)
|
462
466
|
kwargs.pop("use_gpu", None)
|
467
|
+
kwargs.pop("device", None)
|
468
|
+
kwargs.pop("gpu_memory_limit", None)
|
469
|
+
kwargs.pop("fallback_to_cpu", None)
|
463
470
|
|
464
471
|
try:
|
465
472
|
result = self._reader.readtext(
|
@@ -7,7 +7,6 @@ from importlib.util import find_spec
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
9
9
|
|
10
|
-
import numpy as np
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
380
379
|
Raises:
|
381
380
|
OCRError: If OCR processing fails.
|
382
381
|
"""
|
382
|
+
import numpy as np # noqa: PLC0415
|
383
|
+
|
383
384
|
self._init_paddle_ocr_sync(**kwargs)
|
384
385
|
|
385
386
|
if image.mode != "RGB":
|
@@ -349,7 +349,7 @@ class ExtractionConfig:
|
|
349
349
|
"""Configuration for language detection. If None, uses default settings."""
|
350
350
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
351
351
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
352
|
-
auto_detect_document_type: bool =
|
352
|
+
auto_detect_document_type: bool = False
|
353
353
|
"""Whether to automatically detect the document type."""
|
354
354
|
document_type_confidence_threshold: float = 0.5
|
355
355
|
"""Confidence threshold for document type detection."""
|
@@ -398,15 +398,16 @@ class ExtractionConfig:
|
|
398
398
|
return asdict(self.ocr_config)
|
399
399
|
|
400
400
|
# Lazy load and cache default configs instead of creating new instances
|
401
|
-
|
402
|
-
|
401
|
+
match self.ocr_backend:
|
402
|
+
case "tesseract":
|
403
|
+
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
403
404
|
|
404
|
-
|
405
|
-
|
406
|
-
|
405
|
+
return asdict(TesseractConfig())
|
406
|
+
case "easyocr":
|
407
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
407
408
|
|
408
|
-
|
409
|
-
|
410
|
-
|
409
|
+
return asdict(EasyOCRConfig())
|
410
|
+
case _: # paddleocr or any other backend
|
411
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
411
412
|
|
412
|
-
|
413
|
+
return asdict(PaddleOCRConfig())
|