kreuzberg 3.9.1__tar.gz → 3.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/ci.yaml +97 -46
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/PKG-INFO +4 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/ai-rulez.yaml +11 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_api/main.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_chunker.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_config.py +23 -2
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py +40 -5
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py +2 -2
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py +31 -8
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py +77 -6
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_structured.py +3 -3
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_gmft.py +2 -2
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_language_detection.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mcp/server.py +1 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_base.py +3 -3
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_easyocr.py +3 -3
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_paddleocr.py +2 -2
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_playa.py +9 -5
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_types.py +7 -5
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_device.py +6 -6
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_document_cache.py +1 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/cli.py +6 -6
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/pyproject.toml +19 -3
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/api/main_test.py +323 -0
- kreuzberg-3.10.1/tests/cli_command_test.py +523 -0
- kreuzberg-3.10.1/tests/config_test.py +1570 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/conftest.py +6 -0
- kreuzberg-3.10.1/tests/document_classification_test.py +886 -0
- kreuzberg-3.10.1/tests/entity_extraction_test.py +675 -0
- kreuzberg-3.10.1/tests/extraction_test.py +834 -0
- kreuzberg-3.10.1/tests/extractors/email_test.py +1003 -0
- kreuzberg-3.10.1/tests/extractors/image_test.py +768 -0
- kreuzberg-3.10.1/tests/extractors/pandoc_test.py +2123 -0
- kreuzberg-3.10.1/tests/extractors/pdf_test.py +973 -0
- kreuzberg-3.10.1/tests/extractors/presentation_test.py +1005 -0
- kreuzberg-3.10.1/tests/extractors/spreed_sheet_test.py +1237 -0
- kreuzberg-3.10.1/tests/extractors/structured_test.py +302 -0
- kreuzberg-3.10.1/tests/gmft_test.py +720 -0
- kreuzberg-3.10.1/tests/language_detection_test.py +172 -0
- kreuzberg-3.10.1/tests/mcp_server_test.py +883 -0
- kreuzberg-3.10.1/tests/ocr/tesseract_test.py +1141 -0
- kreuzberg-3.10.1/tests/playa_helpers_test.py +549 -0
- kreuzberg-3.10.1/tests/types_test.py +440 -0
- kreuzberg-3.10.1/tests/utils/string_test.py +305 -0
- kreuzberg-3.10.1/tests/utils_errors_test.py +299 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/uv.lock +217 -129
- kreuzberg-3.9.1/tests/config_test.py +0 -401
- kreuzberg-3.9.1/tests/document_classification_test.py +0 -86
- kreuzberg-3.9.1/tests/entity_extraction_test.py +0 -102
- kreuzberg-3.9.1/tests/extraction_test.py +0 -389
- kreuzberg-3.9.1/tests/extractors/email_comprehensive_test.py +0 -326
- kreuzberg-3.9.1/tests/extractors/email_test.py +0 -31
- kreuzberg-3.9.1/tests/extractors/image_test.py +0 -275
- kreuzberg-3.9.1/tests/extractors/pandoc_test.py +0 -458
- kreuzberg-3.9.1/tests/extractors/pdf_test.py +0 -390
- kreuzberg-3.9.1/tests/extractors/presentation_test.py +0 -410
- kreuzberg-3.9.1/tests/extractors/spreed_sheet_test.py +0 -325
- kreuzberg-3.9.1/tests/extractors/structured_test.py +0 -90
- kreuzberg-3.9.1/tests/gmft_test.py +0 -397
- kreuzberg-3.9.1/tests/language_detection_test.py +0 -237
- kreuzberg-3.9.1/tests/mcp_server_test.py +0 -382
- kreuzberg-3.9.1/tests/ocr/tesseract_test.py +0 -477
- kreuzberg-3.9.1/tests/types_test.py +0 -191
- kreuzberg-3.9.1/tests/utils/string_test.py +0 -85
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.commitlintrc +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.deepsource.toml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.docker/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.dockerignore +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.gitignore +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/LICENSE +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/changelog.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/cli.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/contributing.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
|
|
51
51
|
- name: Execute Pre-Commit
|
52
52
|
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
53
53
|
|
54
|
+
# Coverage job runs first, only on Python 3.13 Ubuntu
|
55
|
+
coverage:
|
56
|
+
needs: validate
|
57
|
+
runs-on: ubuntu-latest
|
58
|
+
timeout-minutes: 20
|
59
|
+
steps:
|
60
|
+
- name: Checkout
|
61
|
+
uses: actions/checkout@v4
|
62
|
+
|
63
|
+
- name: Install uv
|
64
|
+
uses: astral-sh/setup-uv@v6
|
65
|
+
with:
|
66
|
+
enable-cache: true
|
67
|
+
|
68
|
+
- name: Install Python
|
69
|
+
uses: actions/setup-python@v5
|
70
|
+
id: setup-python
|
71
|
+
with:
|
72
|
+
python-version: "3.13"
|
73
|
+
|
74
|
+
- name: Cache Python Dependencies
|
75
|
+
id: python-cache
|
76
|
+
uses: actions/cache@v4
|
77
|
+
with:
|
78
|
+
path: |
|
79
|
+
~/.cache/uv
|
80
|
+
.venv
|
81
|
+
key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
|
82
|
+
restore-keys: |
|
83
|
+
python-dependencies-ubuntu-latest-3.13-
|
84
|
+
|
85
|
+
- name: Install Dependencies
|
86
|
+
uses: nick-fields/retry@v3
|
87
|
+
with:
|
88
|
+
timeout_minutes: 5
|
89
|
+
max_attempts: 3
|
90
|
+
retry_wait_seconds: 30
|
91
|
+
command: |
|
92
|
+
uv sync --all-packages --all-extras --dev
|
93
|
+
shell: bash
|
94
|
+
|
95
|
+
- name: Install System Dependencies
|
96
|
+
uses: nick-fields/retry@v3
|
97
|
+
with:
|
98
|
+
timeout_minutes: 5
|
99
|
+
max_attempts: 3
|
100
|
+
retry_wait_seconds: 30
|
101
|
+
command: |
|
102
|
+
sudo apt-get update
|
103
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
104
|
+
shell: bash
|
105
|
+
|
106
|
+
- name: Run Tests with Coverage
|
107
|
+
uses: nick-fields/retry@v3
|
108
|
+
with:
|
109
|
+
timeout_minutes: 15
|
110
|
+
max_attempts: 3
|
111
|
+
retry_wait_seconds: 10
|
112
|
+
command: |
|
113
|
+
uv run coverage erase
|
114
|
+
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
|
115
|
+
uv run coverage report --precision=2
|
116
|
+
shell: bash
|
117
|
+
|
118
|
+
- name: Upload Coverage to DeepSource
|
119
|
+
if: always() && github.event_name == 'push'
|
120
|
+
env:
|
121
|
+
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
122
|
+
run: |
|
123
|
+
# Install DeepSource CLI
|
124
|
+
curl -fsSL https://deepsource.io/cli | sh
|
125
|
+
# Upload coverage report
|
126
|
+
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
127
|
+
|
128
|
+
- name: Upload Coverage Artifacts
|
129
|
+
if: always()
|
130
|
+
uses: actions/upload-artifact@v4
|
131
|
+
with:
|
132
|
+
name: coverage-report-${{ github.sha }}
|
133
|
+
path: |
|
134
|
+
coverage.lcov
|
135
|
+
.coverage
|
136
|
+
retention-days: 7
|
137
|
+
|
138
|
+
# Full test matrix runs only after coverage succeeds
|
54
139
|
test:
|
140
|
+
needs: coverage
|
141
|
+
runs-on: ${{ matrix.os }}
|
55
142
|
strategy:
|
143
|
+
fail-fast: false
|
56
144
|
matrix:
|
57
|
-
os: [
|
58
|
-
python:
|
59
|
-
|
145
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
146
|
+
python: ["3.10", "3.11", "3.12", "3.13"]
|
147
|
+
exclude:
|
148
|
+
# Skip Python 3.13 on macOS for now due to compatibility issues
|
149
|
+
- os: macos-latest
|
150
|
+
python: "3.13"
|
60
151
|
timeout-minutes: 30
|
61
152
|
steps:
|
62
153
|
- name: Checkout
|
@@ -146,52 +237,12 @@ jobs:
|
|
146
237
|
pandoc --version
|
147
238
|
shell: pwsh
|
148
239
|
|
149
|
-
- name:
|
150
|
-
run: |
|
151
|
-
rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
|
152
|
-
shell: bash
|
153
|
-
|
154
|
-
- name: Run Tests with Coverage
|
155
|
-
run: |
|
156
|
-
uv run coverage erase
|
157
|
-
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
|
158
|
-
|
159
|
-
- name: Upload Coverage Artifacts
|
160
|
-
if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
|
161
|
-
uses: actions/upload-artifact@v4
|
162
|
-
with:
|
163
|
-
name: coverage-report
|
164
|
-
path: coverage.lcov
|
165
|
-
retention-days: 1
|
166
|
-
|
167
|
-
upload-coverage:
|
168
|
-
needs: test
|
169
|
-
runs-on: ubuntu-latest
|
170
|
-
if: github.event_name == 'push' || github.event_name == 'pull_request'
|
171
|
-
steps:
|
172
|
-
- name: Checkout
|
173
|
-
uses: actions/checkout@v4
|
174
|
-
with:
|
175
|
-
ref: ${{ github.event.pull_request.head.sha || github.sha }}
|
176
|
-
|
177
|
-
- name: Download Coverage Artifacts
|
178
|
-
uses: actions/download-artifact@v4
|
179
|
-
with:
|
180
|
-
name: coverage-report
|
181
|
-
path: .
|
182
|
-
|
183
|
-
- name: Install DeepSource CLI
|
240
|
+
- name: Run Tests (without coverage)
|
184
241
|
uses: nick-fields/retry@v3
|
185
242
|
with:
|
186
|
-
timeout_minutes:
|
243
|
+
timeout_minutes: 15
|
187
244
|
max_attempts: 3
|
188
245
|
retry_wait_seconds: 10
|
189
246
|
command: |
|
190
|
-
|
247
|
+
uv run pytest -s -vvv --reruns 2 --reruns-delay 1
|
191
248
|
shell: bash
|
192
|
-
|
193
|
-
- name: Upload Coverage to DeepSource
|
194
|
-
env:
|
195
|
-
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
196
|
-
run: |
|
197
|
-
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.9.1
|
3
|
+
Version: 3.10.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
53
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
54
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
55
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
56
57
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
57
58
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
59
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
@@ -69,6 +70,8 @@ Provides-Extra: cli
|
|
69
70
|
Requires-Dist: click>=8.2.1; extra == 'cli'
|
70
71
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
71
72
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
|
+
Provides-Extra: crypto
|
74
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
72
75
|
Provides-Extra: easyocr
|
73
76
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
74
77
|
Provides-Extra: entity-extraction
|
@@ -193,16 +193,18 @@ rules:
|
|
193
193
|
api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
|
194
194
|
cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
|
195
195
|
chunking = ["semantic-text-splitter>=0.27.0"]
|
196
|
+
crypto = ["playa-pdf[crypto]>=0.6.4"]
|
196
197
|
easyocr = ["easyocr>=1.7.2"]
|
197
198
|
gmft = ["gmft>=0.4.2"]
|
198
199
|
langdetect = ["fast-langdetect>=0.2.0"]
|
199
200
|
paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
|
200
|
-
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
|
201
|
+
all = ["kreuzberg[api,chunking,cli,crypto,easyocr,gmft,langdetect,paddleocr]"]
|
201
202
|
```
|
202
203
|
|
203
204
|
### Installation Patterns
|
204
205
|
- Basic: `pip install kreuzberg`
|
205
206
|
- With features: `pip install "kreuzberg[api,cli]"`
|
207
|
+
- With crypto support: `pip install "kreuzberg[crypto]"`
|
206
208
|
- All features: `pip install "kreuzberg[all]"`
|
207
209
|
- Development: `uv sync --all-extras`
|
208
210
|
|
@@ -211,6 +213,14 @@ rules:
|
|
211
213
|
- **System**: tesseract-ocr, pandoc (via package manager)
|
212
214
|
- **Development**: Uses dependency groups in pyproject.toml
|
213
215
|
|
216
|
+
### Crypto Support
|
217
|
+
The `crypto` extra adds cryptographic support for PDF processing:
|
218
|
+
- **Purpose**: Enables AES encryption/decryption for password-protected PDFs
|
219
|
+
- **Dependencies**: Adds cryptography (~22MB), cffi, and pycparser
|
220
|
+
- **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
|
221
|
+
- **Password Support**: Supports single password or list of passwords to try in sequence
|
222
|
+
- **Size Impact**: Increases installation size by ~24MB due to cryptography package
|
223
|
+
|
214
224
|
sections:
|
215
225
|
- title: "Language Detection"
|
216
226
|
content: |
|
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
|
|
108
108
|
return results # type: ignore[return-value]
|
109
109
|
|
110
110
|
|
111
|
-
if __name__ == "__main__":
|
111
|
+
if __name__ == "__main__": # pragma: no cover
|
112
112
|
baseline_results = asyncio.run(run_baseline_benchmark())
|
113
113
|
|
114
114
|
baseline_file = Path("baseline_results.json")
|
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
|
|
195
195
|
}
|
196
196
|
|
197
197
|
|
198
|
-
if __name__ == "__main__":
|
198
|
+
if __name__ == "__main__": # pragma: no cover
|
199
199
|
print("🧪 REPRODUCIBLE CACHE BENCHMARK")
|
200
200
|
print("Testing msgpack implementation with statistical rigor...")
|
201
201
|
print()
|
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
|
|
187
187
|
}
|
188
188
|
|
189
189
|
|
190
|
-
if __name__ == "__main__":
|
190
|
+
if __name__ == "__main__": # pragma: no cover
|
191
191
|
print("🧪 STATISTICAL CACHE BENCHMARK")
|
192
192
|
print("Testing msgpack implementation with proper error analysis...")
|
193
193
|
print()
|
@@ -30,7 +30,7 @@ try:
|
|
30
30
|
HTTP_422_UNPROCESSABLE_ENTITY,
|
31
31
|
HTTP_500_INTERNAL_SERVER_ERROR,
|
32
32
|
)
|
33
|
-
except ImportError as e:
|
33
|
+
except ImportError as e: # pragma: no cover
|
34
34
|
raise MissingDependencyError.create_for_package(
|
35
35
|
dependency_group="litestar",
|
36
36
|
functionality="Litestar API and docker container",
|
@@ -43,7 +43,7 @@ def get_chunker(
|
|
43
43
|
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
44
44
|
|
45
45
|
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
|
-
except ImportError as e:
|
46
|
+
except ImportError as e: # pragma: no cover
|
47
47
|
raise MissingDependencyError.create_for_package(
|
48
48
|
dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
|
49
49
|
) from e
|
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
|
|
13
13
|
|
14
14
|
if sys.version_info >= (3, 11):
|
15
15
|
import tomllib
|
16
|
-
else:
|
16
|
+
else: # pragma: no cover
|
17
17
|
import tomli as tomllib # type: ignore[import-not-found]
|
18
18
|
|
19
19
|
from kreuzberg._gmft import GMFTConfig
|
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
50
50
|
# Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
|
51
51
|
if config_path.name == "kreuzberg.toml":
|
52
52
|
return data # type: ignore[no-any-return]
|
53
|
-
|
53
|
+
|
54
|
+
# For other files, check if they have [tool.kreuzberg] section
|
55
|
+
if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
|
56
|
+
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
57
|
+
|
58
|
+
# Otherwise assume root-level configuration
|
59
|
+
return data # type: ignore[no-any-return]
|
54
60
|
|
55
61
|
|
56
62
|
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
129
135
|
"extract_keywords",
|
130
136
|
"auto_detect_language",
|
131
137
|
"enable_quality_processing",
|
138
|
+
"auto_detect_document_type",
|
139
|
+
"document_type_confidence_threshold",
|
140
|
+
"document_classification_mode",
|
141
|
+
"keyword_count",
|
132
142
|
}
|
133
143
|
extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
|
134
144
|
|
135
145
|
# Handle OCR backend configuration
|
136
146
|
ocr_backend = extraction_config.get("ocr_backend")
|
137
147
|
if ocr_backend and ocr_backend != "none":
|
148
|
+
# Validate OCR backend
|
149
|
+
valid_backends = {"tesseract", "easyocr", "paddleocr"}
|
150
|
+
if ocr_backend not in valid_backends:
|
151
|
+
raise ValidationError(
|
152
|
+
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
|
153
|
+
context={"provided": ocr_backend, "valid": sorted(valid_backends)},
|
154
|
+
)
|
138
155
|
ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
|
139
156
|
if ocr_config:
|
140
157
|
extraction_config["ocr_config"] = ocr_config
|
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
|
|
286
303
|
"extract_keywords",
|
287
304
|
"auto_detect_language",
|
288
305
|
"enable_quality_processing",
|
306
|
+
"auto_detect_document_type",
|
307
|
+
"document_type_confidence_threshold",
|
308
|
+
"document_classification_mode",
|
309
|
+
"keyword_count",
|
289
310
|
]
|
290
311
|
|
291
312
|
|
@@ -4,13 +4,12 @@ import re
|
|
4
4
|
from typing import TYPE_CHECKING
|
5
5
|
|
6
6
|
from kreuzberg._ocr import get_ocr_backend
|
7
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
|
7
8
|
from kreuzberg.exceptions import MissingDependencyError
|
8
9
|
|
9
10
|
if TYPE_CHECKING:
|
10
11
|
from pathlib import Path
|
11
12
|
|
12
|
-
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
13
|
-
|
14
13
|
|
15
14
|
DOCUMENT_CLASSIFIERS = {
|
16
15
|
"invoice": [
|
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
52
51
|
Raises:
|
53
52
|
MissingDependencyError: If the deep-translator package is not installed
|
54
53
|
"""
|
54
|
+
# Combine content with metadata for classification
|
55
|
+
text_to_classify = result.content
|
56
|
+
if result.metadata:
|
57
|
+
# Add metadata values to the text for classification
|
58
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
59
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
60
|
+
|
55
61
|
try:
|
56
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
57
|
-
except ImportError as e:
|
63
|
+
except ImportError as e: # pragma: no cover
|
58
64
|
raise MissingDependencyError(
|
59
65
|
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
|
60
66
|
) from e
|
61
67
|
|
62
|
-
|
68
|
+
try:
|
69
|
+
return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
70
|
+
except Exception: # noqa: BLE001
|
71
|
+
# Fall back to original content in lowercase if translation fails
|
72
|
+
return text_to_classify.lower()
|
63
73
|
|
64
74
|
|
65
75
|
def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
|
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
|
|
73
83
|
A tuple containing the detected document type and the confidence score,
|
74
84
|
or (None, None) if no type is detected with sufficient confidence.
|
75
85
|
"""
|
86
|
+
if not config.auto_detect_document_type:
|
87
|
+
return None, None
|
88
|
+
|
76
89
|
translated_text = _get_translated_text(result)
|
77
90
|
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
|
78
91
|
|
@@ -108,7 +121,8 @@ def classify_document_from_layout(
|
|
108
121
|
A tuple containing the detected document type and the confidence score,
|
109
122
|
or (None, None) if no type is detected with sufficient confidence.
|
110
123
|
"""
|
111
|
-
|
124
|
+
if not config.auto_detect_document_type:
|
125
|
+
return None, None
|
112
126
|
|
113
127
|
if result.layout is None or result.layout.empty:
|
114
128
|
return None, None
|
@@ -117,6 +131,24 @@ def classify_document_from_layout(
|
|
117
131
|
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
118
132
|
return None, None
|
119
133
|
|
134
|
+
# Use layout text for classification, not the content
|
135
|
+
layout_text = " ".join(layout_df["text"].astype(str).tolist())
|
136
|
+
|
137
|
+
# Translate layout text directly for classification
|
138
|
+
text_to_classify = layout_text
|
139
|
+
if result.metadata:
|
140
|
+
# Add metadata values to the text for classification
|
141
|
+
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
142
|
+
text_to_classify = f"{text_to_classify} {metadata_text}"
|
143
|
+
|
144
|
+
try:
|
145
|
+
from deep_translator import GoogleTranslator # noqa: PLC0415
|
146
|
+
|
147
|
+
translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
148
|
+
except Exception: # noqa: BLE001
|
149
|
+
# Fall back to original content in lowercase if translation fails
|
150
|
+
translated_text = text_to_classify.lower()
|
151
|
+
|
120
152
|
layout_df["translated_text"] = translated_text
|
121
153
|
|
122
154
|
page_height = layout_df["top"].max() + layout_df["height"].max()
|
@@ -151,6 +183,9 @@ def auto_detect_document_type(
|
|
151
183
|
if config.document_classification_mode == "vision" and file_path:
|
152
184
|
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
153
185
|
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
186
|
+
elif result.layout is not None and not result.layout.empty:
|
187
|
+
# Use layout-based classification if layout data is available
|
188
|
+
result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
|
154
189
|
else:
|
155
190
|
result.document_type, result.document_type_confidence = classify_document(result, config)
|
156
191
|
return result
|
@@ -139,7 +139,7 @@ def extract_entities(
|
|
139
139
|
|
140
140
|
try:
|
141
141
|
import spacy # noqa: F401, PLC0415
|
142
|
-
except ImportError as e:
|
142
|
+
except ImportError as e: # pragma: no cover
|
143
143
|
raise MissingDependencyError.create_for_package(
|
144
144
|
package_name="spacy",
|
145
145
|
dependency_group="entity-extraction",
|
@@ -230,7 +230,7 @@ def extract_keywords(
|
|
230
230
|
return [(kw, float(score)) for kw, score in keywords]
|
231
231
|
except (RuntimeError, OSError, ValueError):
|
232
232
|
return []
|
233
|
-
except ImportError as e:
|
233
|
+
except ImportError as e: # pragma: no cover
|
234
234
|
raise MissingDependencyError.create_for_package(
|
235
235
|
package_name="keybert",
|
236
236
|
dependency_group="entity-extraction",
|
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
|
|
19
19
|
# Import optional dependencies at module level with proper error handling
|
20
20
|
try:
|
21
21
|
import mailparse
|
22
|
-
except ImportError:
|
22
|
+
except ImportError: # pragma: no cover
|
23
23
|
mailparse = None
|
24
24
|
|
25
25
|
try:
|
26
26
|
import html2text # type: ignore[import-not-found]
|
27
|
-
except ImportError:
|
27
|
+
except ImportError: # pragma: no cover
|
28
28
|
html2text = None
|
29
29
|
|
30
30
|
# Compile regex pattern once at module level
|
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
|
|
59
59
|
|
60
60
|
to_info = parsed_email.get("to")
|
61
61
|
if to_info:
|
62
|
+
# Store the raw value in metadata (could be string, dict, or list)
|
62
63
|
if isinstance(to_info, list) and to_info:
|
64
|
+
# For metadata, use first recipient's email if it's a list
|
63
65
|
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
66
|
+
metadata["email_to"] = to_email
|
64
67
|
elif isinstance(to_info, dict):
|
65
|
-
|
68
|
+
metadata["email_to"] = to_info.get("email", "")
|
66
69
|
else:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
+
metadata["email_to"] = str(to_info)
|
71
|
+
|
72
|
+
# For display, format all recipients
|
73
|
+
to_formatted = self._format_email_field(to_info)
|
74
|
+
text_parts.append(f"To: {to_formatted}")
|
70
75
|
|
71
76
|
date = parsed_email.get("date")
|
72
77
|
if date:
|
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
|
|
76
81
|
cc = parsed_email.get("cc")
|
77
82
|
if cc:
|
78
83
|
metadata["email_cc"] = cc
|
79
|
-
|
84
|
+
cc_formatted = self._format_email_field(cc)
|
85
|
+
text_parts.append(f"CC: {cc_formatted}")
|
80
86
|
|
81
87
|
bcc = parsed_email.get("bcc")
|
82
88
|
if bcc:
|
83
89
|
metadata["email_bcc"] = bcc
|
84
|
-
|
90
|
+
bcc_formatted = self._format_email_field(bcc)
|
91
|
+
text_parts.append(f"BCC: {bcc_formatted}")
|
92
|
+
|
93
|
+
def _format_email_field(self, field: Any) -> str:
|
94
|
+
"""Format email field (to, cc, bcc) for display."""
|
95
|
+
if isinstance(field, list):
|
96
|
+
emails = []
|
97
|
+
for item in field:
|
98
|
+
if isinstance(item, dict):
|
99
|
+
email = item.get("email", "")
|
100
|
+
if email:
|
101
|
+
emails.append(email)
|
102
|
+
else:
|
103
|
+
emails.append(str(item))
|
104
|
+
return ", ".join(emails)
|
105
|
+
if isinstance(field, dict):
|
106
|
+
return str(field.get("email", ""))
|
107
|
+
return str(field)
|
85
108
|
|
86
109
|
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
87
110
|
"""Extract and process email body content."""
|
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
|
|
22
22
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
23
23
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
24
24
|
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
25
|
-
from kreuzberg._types import ExtractionResult, OcrBackendType
|
25
|
+
from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
|
26
26
|
from kreuzberg._utils._errors import create_error_context, should_retry
|
27
27
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
28
28
|
from kreuzberg._utils._string import normalize_spaces
|
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
|
|
33
33
|
|
34
34
|
if TYPE_CHECKING: # pragma: no cover
|
35
35
|
from PIL.Image import Image
|
36
|
+
from playa.document import Document
|
36
37
|
|
37
38
|
|
38
39
|
class PDFExtractor(Extractor):
|
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
|
|
45
46
|
file_path, unlink = await create_temp_file(".pdf")
|
46
47
|
await AsyncPath(file_path).write_bytes(content)
|
47
48
|
try:
|
48
|
-
metadata = await
|
49
|
+
metadata = await self._extract_metadata_with_password_attempts(content)
|
49
50
|
result = await self.extract_path_async(file_path)
|
50
51
|
|
51
52
|
result.metadata = metadata
|
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
|
|
73
74
|
if not result:
|
74
75
|
result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
75
76
|
|
76
|
-
result.metadata = await
|
77
|
+
result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
|
77
78
|
|
78
79
|
if self.config.extract_tables:
|
79
80
|
# GMFT is optional dependency
|
@@ -81,7 +82,7 @@ class PDFExtractor(Extractor):
|
|
81
82
|
from kreuzberg._gmft import extract_tables # noqa: PLC0415
|
82
83
|
|
83
84
|
result.tables = await extract_tables(path, self.config.gmft_config)
|
84
|
-
except ImportError:
|
85
|
+
except ImportError: # pragma: no cover
|
85
86
|
result.tables = []
|
86
87
|
|
87
88
|
# Enhance metadata with table information
|
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
|
|
107
108
|
|
108
109
|
result = self.extract_path_sync(Path(temp_path))
|
109
110
|
|
110
|
-
metadata =
|
111
|
+
metadata = self._extract_metadata_with_password_attempts_sync(content)
|
111
112
|
result.metadata = metadata
|
112
113
|
|
113
114
|
return result
|
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
|
|
406
407
|
# Use list comprehension and join for efficient string building
|
407
408
|
return "\n\n".join(result.content for result in results)
|
408
409
|
|
410
|
+
def _parse_with_password_attempts(self, content: bytes) -> Document:
|
411
|
+
"""Parse PDF with password attempts."""
|
412
|
+
# Normalize password to list
|
413
|
+
if isinstance(self.config.pdf_password, str):
|
414
|
+
passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
|
415
|
+
else:
|
416
|
+
passwords = list(self.config.pdf_password)
|
417
|
+
|
418
|
+
# Try each password in sequence
|
419
|
+
last_exception = None
|
420
|
+
for password in passwords:
|
421
|
+
try:
|
422
|
+
return parse(content, max_workers=1, password=password)
|
423
|
+
except Exception as e: # noqa: PERF203, BLE001
|
424
|
+
last_exception = e
|
425
|
+
continue
|
426
|
+
|
427
|
+
# If all passwords failed, raise the last exception
|
428
|
+
if last_exception:
|
429
|
+
raise last_exception from None
|
430
|
+
|
431
|
+
# Fallback to no password
|
432
|
+
return parse(content, max_workers=1, password="")
|
433
|
+
|
434
|
+
def _get_passwords_to_try(self) -> list[str]:
|
435
|
+
"""Get list of passwords to try in sequence."""
|
436
|
+
if isinstance(self.config.pdf_password, str):
|
437
|
+
return [self.config.pdf_password] if self.config.pdf_password else [""]
|
438
|
+
return list(self.config.pdf_password) if self.config.pdf_password else [""]
|
439
|
+
|
440
|
+
async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
|
441
|
+
"""Extract PDF metadata with password attempts."""
|
442
|
+
passwords = self._get_passwords_to_try()
|
443
|
+
|
444
|
+
last_exception = None
|
445
|
+
for password in passwords:
|
446
|
+
try:
|
447
|
+
return await extract_pdf_metadata(content, password=password)
|
448
|
+
except Exception as e: # noqa: PERF203, BLE001
|
449
|
+
last_exception = e
|
450
|
+
continue
|
451
|
+
|
452
|
+
# If all passwords failed, try with empty password as fallback
|
453
|
+
try:
|
454
|
+
return await extract_pdf_metadata(content, password="")
|
455
|
+
except Exception:
|
456
|
+
if last_exception:
|
457
|
+
raise last_exception from None
|
458
|
+
raise
|
459
|
+
|
460
|
+
def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
|
461
|
+
"""Extract PDF metadata with password attempts (sync version)."""
|
462
|
+
passwords = self._get_passwords_to_try()
|
463
|
+
|
464
|
+
last_exception = None
|
465
|
+
for password in passwords:
|
466
|
+
try:
|
467
|
+
return extract_pdf_metadata_sync(content, password=password)
|
468
|
+
except Exception as e: # noqa: PERF203, BLE001
|
469
|
+
last_exception = e
|
470
|
+
continue
|
471
|
+
|
472
|
+
# If all passwords failed, try with empty password as fallback
|
473
|
+
try:
|
474
|
+
return extract_pdf_metadata_sync(content, password="")
|
475
|
+
except Exception:
|
476
|
+
if last_exception:
|
477
|
+
raise last_exception from None
|
478
|
+
raise
|
479
|
+
|
409
480
|
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
410
481
|
"""Extract text using playa for better structure preservation."""
|
411
482
|
with contextlib.suppress(Exception):
|
412
483
|
content = path.read_bytes()
|
413
|
-
document =
|
484
|
+
document = self._parse_with_password_attempts(content)
|
414
485
|
|
415
486
|
# Extract text while preserving structure
|
416
487
|
pages_text = []
|