kreuzberg 3.11.0__tar.gz → 3.11.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/workflows/ci.yaml +3 -3
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/workflows/docs.yml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/workflows/pr-title.yaml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/workflows/publish-docker.yml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/workflows/release.yaml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.pre-commit-config.yaml +4 -4
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/PKG-INFO +10 -10
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/examples/extraction-examples.md +4 -4
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/extraction-configuration.md +3 -3
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_gmft.py +28 -10
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_ocr/_easyocr.py +8 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_ocr/_paddleocr.py +2 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/pyproject.toml +9 -9
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/gmft_test.py +119 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/uv.lock +1039 -988
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.commitlintrc +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.deepsource.toml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.docker/Dockerfile +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.docker/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.dockerignore +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.gitignore +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/LICENSE +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/ai-rulez.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/cli.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/contributing.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/mkdocs.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/api/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/api/main_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/cli_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/config_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/conftest.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extraction_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/types_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.2}/tests/utils_errors_test.py +0 -0
@@ -15,7 +15,7 @@ jobs:
|
|
15
15
|
timeout-minutes: 10
|
16
16
|
steps:
|
17
17
|
- name: Checkout
|
18
|
-
uses: actions/checkout@
|
18
|
+
uses: actions/checkout@v5
|
19
19
|
|
20
20
|
- name: Install uv
|
21
21
|
uses: astral-sh/setup-uv@v6
|
@@ -58,7 +58,7 @@ jobs:
|
|
58
58
|
timeout-minutes: 20
|
59
59
|
steps:
|
60
60
|
- name: Checkout
|
61
|
-
uses: actions/checkout@
|
61
|
+
uses: actions/checkout@v5
|
62
62
|
|
63
63
|
- name: Install uv
|
64
64
|
uses: astral-sh/setup-uv@v6
|
@@ -151,7 +151,7 @@ jobs:
|
|
151
151
|
timeout-minutes: 30
|
152
152
|
steps:
|
153
153
|
- name: Checkout
|
154
|
-
uses: actions/checkout@
|
154
|
+
uses: actions/checkout@v5
|
155
155
|
|
156
156
|
- name: Install uv
|
157
157
|
uses: astral-sh/setup-uv@v6
|
@@ -6,12 +6,12 @@ repos:
|
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
8
|
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
-
rev: v1.1
|
9
|
+
rev: v1.5.1
|
10
10
|
hooks:
|
11
11
|
- id: ai-rulez-validate
|
12
12
|
- id: ai-rulez-generate
|
13
13
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
14
|
-
rev:
|
14
|
+
rev: v6.0.0
|
15
15
|
hooks:
|
16
16
|
- id: name-tests-test
|
17
17
|
args:
|
@@ -53,7 +53,7 @@ repos:
|
|
53
53
|
hooks:
|
54
54
|
- id: pyproject-fmt
|
55
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.9
|
57
57
|
hooks:
|
58
58
|
- id: ruff
|
59
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -66,7 +66,7 @@ repos:
|
|
66
66
|
additional_dependencies:
|
67
67
|
- tomli
|
68
68
|
- repo: https://github.com/jsh9/pydoclint
|
69
|
-
rev: 0.6.
|
69
|
+
rev: 0.6.10
|
70
70
|
hooks:
|
71
71
|
- id: pydoclint
|
72
72
|
args:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.
|
3
|
+
Version: 3.11.2
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.
|
35
|
+
Requires-Dist: mcp>=1.13.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: playa-pdf>=0.
|
37
|
+
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
39
39
|
Requires-Dist: pypdfium2==4.30.0
|
40
40
|
Requires-Dist: python-calamine>=0.3.2
|
@@ -50,19 +50,19 @@ Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
|
50
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
51
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
52
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
53
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
53
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
54
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
55
|
-
Requires-Dist: paddleocr>=3.1.
|
55
|
+
Requires-Dist: paddleocr>=3.1.1; extra == 'all'
|
56
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
57
|
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
58
|
-
Requires-Dist: playa-pdf[crypto]>=0.
|
58
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
59
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
60
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
61
61
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
62
62
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
63
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
64
64
|
Provides-Extra: api
|
65
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
65
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
66
66
|
Provides-Extra: chunking
|
67
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
68
68
|
Provides-Extra: cli
|
@@ -70,7 +70,7 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
|
|
70
70
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
71
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
72
72
|
Provides-Extra: crypto
|
73
|
-
Requires-Dist: playa-pdf[crypto]>=0.
|
73
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
74
74
|
Provides-Extra: document-classification
|
75
75
|
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
76
|
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
@@ -84,7 +84,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
|
84
84
|
Provides-Extra: langdetect
|
85
85
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
86
86
|
Provides-Extra: paddleocr
|
87
|
-
Requires-Dist: paddleocr>=3.1.
|
87
|
+
Requires-Dist: paddleocr>=3.1.1; extra == 'paddleocr'
|
88
88
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
89
89
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
90
90
|
Description-Content-Type: text/markdown
|
@@ -132,15 +132,15 @@ async def extract_tables_from_pdf():
|
|
132
132
|
# Process extracted tables
|
133
133
|
print(f"Found {len(result.tables)} tables")
|
134
134
|
for i, table in enumerate(result.tables):
|
135
|
-
print(f"Table {i+1} on page {table
|
136
|
-
print(table
|
135
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
136
|
+
print(table["text"]) # Markdown formatted table
|
137
137
|
|
138
138
|
# Work with the pandas DataFrame
|
139
|
-
df = table
|
139
|
+
df = table["df"]
|
140
140
|
print(f"Table shape: {df.shape}")
|
141
141
|
|
142
142
|
# The cropped table image is also available
|
143
|
-
# table
|
143
|
+
# table['cropped_image'].save(f"table_{i+1}.png")
|
144
144
|
|
145
145
|
# With custom GMFT configuration
|
146
146
|
custom_config = ExtractionConfig(
|
@@ -237,10 +237,10 @@ result = await extract_file("document_with_tables.pdf", config=config)
|
|
237
237
|
|
238
238
|
# Access extracted tables
|
239
239
|
for i, table in enumerate(result.tables):
|
240
|
-
print(f"Table {i+1} on page {table
|
241
|
-
print(table
|
240
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
241
|
+
print(table["text"]) # Markdown formatted table text
|
242
242
|
# You can also access the pandas DataFrame directly
|
243
|
-
df = table
|
243
|
+
df = table["df"]
|
244
244
|
print(df.shape) # (rows, columns)
|
245
245
|
```
|
246
246
|
|
@@ -444,14 +444,26 @@ def _extract_tables_in_process(
|
|
444
444
|
cropped_image.save(img_bytes, format="PNG")
|
445
445
|
img_bytes.seek(0)
|
446
446
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
447
|
+
if data_frame.empty:
|
448
|
+
results.append(
|
449
|
+
{
|
450
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
451
|
+
"page_number": cropped_table.page.page_number,
|
452
|
+
"text": data_frame.to_markdown(),
|
453
|
+
"df_columns": data_frame.columns.tolist(),
|
454
|
+
"df_csv": None,
|
455
|
+
}
|
456
|
+
)
|
457
|
+
else:
|
458
|
+
results.append(
|
459
|
+
{
|
460
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
461
|
+
"page_number": cropped_table.page.page_number,
|
462
|
+
"text": data_frame.to_markdown(),
|
463
|
+
"df_columns": None,
|
464
|
+
"df_csv": data_frame.to_csv(index=False),
|
465
|
+
}
|
466
|
+
)
|
455
467
|
|
456
468
|
result_queue.put((True, results))
|
457
469
|
|
@@ -532,7 +544,10 @@ def _extract_tables_isolated(
|
|
532
544
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
533
545
|
import pandas as pd # noqa: PLC0415
|
534
546
|
|
535
|
-
|
547
|
+
if table_dict["df_csv"] is None:
|
548
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
549
|
+
else:
|
550
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
536
551
|
|
537
552
|
tables.append(
|
538
553
|
TableData(
|
@@ -638,7 +653,10 @@ async def _extract_tables_isolated_async(
|
|
638
653
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
639
654
|
import pandas as pd # noqa: PLC0415
|
640
655
|
|
641
|
-
|
656
|
+
if table_dict["df_csv"] is None:
|
657
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
658
|
+
else:
|
659
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
642
660
|
|
643
661
|
tables.append(
|
644
662
|
TableData(
|
@@ -4,7 +4,6 @@ import warnings
|
|
4
4
|
from dataclasses import dataclass
|
5
5
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
6
6
|
|
7
|
-
import numpy as np
|
8
7
|
from PIL import Image
|
9
8
|
|
10
9
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
188
187
|
|
189
188
|
kwargs.pop("language", None)
|
190
189
|
kwargs.pop("use_gpu", None)
|
190
|
+
kwargs.pop("device", None)
|
191
|
+
kwargs.pop("gpu_memory_limit", None)
|
192
|
+
kwargs.pop("fallback_to_cpu", None)
|
191
193
|
|
192
194
|
try:
|
193
195
|
result = await run_sync(
|
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
455
457
|
Raises:
|
456
458
|
OCRError: If OCR processing fails.
|
457
459
|
"""
|
460
|
+
import numpy as np # noqa: PLC0415
|
461
|
+
|
458
462
|
self._init_easyocr_sync(**kwargs)
|
459
463
|
|
460
464
|
beam_width = kwargs.pop("beam_width")
|
461
465
|
kwargs.pop("language", None)
|
462
466
|
kwargs.pop("use_gpu", None)
|
467
|
+
kwargs.pop("device", None)
|
468
|
+
kwargs.pop("gpu_memory_limit", None)
|
469
|
+
kwargs.pop("fallback_to_cpu", None)
|
463
470
|
|
464
471
|
try:
|
465
472
|
result = self._reader.readtext(
|
@@ -7,7 +7,6 @@ from importlib.util import find_spec
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
9
9
|
|
10
|
-
import numpy as np
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
380
379
|
Raises:
|
381
380
|
OCRError: If OCR processing fails.
|
382
381
|
"""
|
382
|
+
import numpy as np # noqa: PLC0415
|
383
|
+
|
383
384
|
self._init_paddle_ocr_sync(**kwargs)
|
384
385
|
|
385
386
|
if image.mode != "RGB":
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.11.
|
8
|
+
version = "3.11.2"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -57,13 +57,13 @@ classifiers = [
|
|
57
57
|
]
|
58
58
|
|
59
59
|
dependencies = [
|
60
|
-
"anyio>=4.
|
60
|
+
"anyio>=4.10.0",
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
63
|
"html-to-markdown[lxml]>=1.9.0",
|
64
|
-
"mcp>=1.
|
64
|
+
"mcp>=1.13.0",
|
65
65
|
"msgspec>=0.18.0",
|
66
|
-
"playa-pdf>=0.
|
66
|
+
"playa-pdf>=0.7.0", # pinned due to breaking changes in 0.5.0
|
67
67
|
"psutil>=7.0.0",
|
68
68
|
"pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
|
69
69
|
"python-calamine>=0.3.2",
|
@@ -79,7 +79,7 @@ optional-dependencies.all = [
|
|
79
79
|
"kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
80
80
|
]
|
81
81
|
optional-dependencies.api = [
|
82
|
-
"litestar[standard,structlog,opentelemetry]>=2.
|
82
|
+
"litestar[standard,structlog,opentelemetry]>=2.17.0",
|
83
83
|
]
|
84
84
|
optional-dependencies.chunking = [ "semantic-text-splitter>=0.27.0" ]
|
85
85
|
optional-dependencies.cli = [
|
@@ -87,7 +87,7 @@ optional-dependencies.cli = [
|
|
87
87
|
"rich>=14.1.0",
|
88
88
|
"tomli>=2.0.0; python_version<'3.11'",
|
89
89
|
]
|
90
|
-
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.
|
90
|
+
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.7.0" ]
|
91
91
|
optional-dependencies.document-classification = [
|
92
92
|
"deep-translator>=1.11.4",
|
93
93
|
"pandas>=2.3.1",
|
@@ -97,7 +97,7 @@ optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
|
|
97
97
|
optional-dependencies.gmft = [ "gmft>=0.4.2" ]
|
98
98
|
optional-dependencies.langdetect = [ "fast-langdetect>=0.3.2" ]
|
99
99
|
optional-dependencies.paddleocr = [
|
100
|
-
"paddleocr>=3.1.
|
100
|
+
"paddleocr>=3.1.1",
|
101
101
|
"paddlepaddle>=3.1.0",
|
102
102
|
"setuptools>=80.9.0",
|
103
103
|
]
|
@@ -111,13 +111,13 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
|
|
111
111
|
dev = [
|
112
112
|
"covdefaults>=2.3.0",
|
113
113
|
"mypy>=1.16.1",
|
114
|
-
"pre-commit>=4.
|
114
|
+
"pre-commit>=4.3.0",
|
115
115
|
"pytest>=8.4.1",
|
116
116
|
"pytest-cov>=6.2.1",
|
117
117
|
"pytest-mock>=3.14.0",
|
118
118
|
"pytest-rerunfailures>=15.1",
|
119
119
|
"pytest-timeout>=2.4.0",
|
120
|
-
"ruff>=0.12.
|
120
|
+
"ruff>=0.12.9",
|
121
121
|
"trio>=0.30.0",
|
122
122
|
"uv-bump",
|
123
123
|
]
|
@@ -669,6 +669,125 @@ class TestGMFTInlineExtractionEdgeCases:
|
|
669
669
|
pytest.skip("GMFT dependency not available for inline testing")
|
670
670
|
|
671
671
|
|
672
|
+
class TestGMFTWithoutTables:
|
673
|
+
"""Test GMFT behavior with PDFs that have no tables - issue #104."""
|
674
|
+
|
675
|
+
@pytest.mark.anyio
|
676
|
+
async def test_extract_tables_pdf_without_tables_async(self) -> None:
|
677
|
+
"""Test that extract_tables handles PDFs without tables gracefully (async)."""
|
678
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
679
|
+
|
680
|
+
try:
|
681
|
+
tables = await extract_tables(pdf_path)
|
682
|
+
|
683
|
+
assert isinstance(tables, list)
|
684
|
+
|
685
|
+
for table in tables:
|
686
|
+
assert "page_number" in table
|
687
|
+
assert "text" in table
|
688
|
+
assert "df" in table
|
689
|
+
assert "cropped_image" in table
|
690
|
+
|
691
|
+
import pandas as pd
|
692
|
+
|
693
|
+
assert isinstance(table["df"], pd.DataFrame)
|
694
|
+
except MissingDependencyError:
|
695
|
+
pytest.skip("GMFT dependency not installed")
|
696
|
+
|
697
|
+
def test_extract_tables_pdf_without_tables_sync(self) -> None:
|
698
|
+
"""Test that extract_tables_sync handles PDFs without tables gracefully (sync)."""
|
699
|
+
# Using searchable.pdf which is a simple text PDF without tables
|
700
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
701
|
+
|
702
|
+
try:
|
703
|
+
tables = extract_tables_sync(pdf_path)
|
704
|
+
|
705
|
+
assert isinstance(tables, list)
|
706
|
+
|
707
|
+
for table in tables:
|
708
|
+
assert "page_number" in table
|
709
|
+
assert "text" in table
|
710
|
+
assert "df" in table
|
711
|
+
assert "cropped_image" in table
|
712
|
+
|
713
|
+
import pandas as pd
|
714
|
+
|
715
|
+
assert isinstance(table["df"], pd.DataFrame)
|
716
|
+
except MissingDependencyError:
|
717
|
+
pytest.skip("GMFT dependency not installed")
|
718
|
+
|
719
|
+
@pytest.mark.anyio
|
720
|
+
async def test_extract_file_with_gmft_pdf_without_tables(self) -> None:
|
721
|
+
"""Test that extract_file with extract_tables=True handles PDFs without tables gracefully."""
|
722
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
723
|
+
|
724
|
+
config = ExtractionConfig(
|
725
|
+
extract_tables=True,
|
726
|
+
gmft_config=GMFTConfig(
|
727
|
+
detector_base_threshold=0.85,
|
728
|
+
remove_null_rows=True,
|
729
|
+
enable_multi_header=True,
|
730
|
+
),
|
731
|
+
)
|
732
|
+
|
733
|
+
try:
|
734
|
+
result = await extract_file(pdf_path, config=config)
|
735
|
+
|
736
|
+
assert result.content
|
737
|
+
assert "Sample PDF" in result.content
|
738
|
+
|
739
|
+
assert hasattr(result, "tables")
|
740
|
+
assert isinstance(result.tables, list)
|
741
|
+
|
742
|
+
for table in result.tables:
|
743
|
+
assert "page_number" in table
|
744
|
+
assert "text" in table
|
745
|
+
assert "df" in table
|
746
|
+
assert "cropped_image" in table
|
747
|
+
|
748
|
+
import pandas as pd
|
749
|
+
|
750
|
+
assert isinstance(table["df"], pd.DataFrame)
|
751
|
+
except MissingDependencyError:
|
752
|
+
pytest.skip("GMFT dependency not installed")
|
753
|
+
|
754
|
+
def test_extract_file_sync_with_gmft_pdf_without_tables(self) -> None:
|
755
|
+
"""Test that extract_file_sync with extract_tables=True handles PDFs without tables gracefully."""
|
756
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
757
|
+
|
758
|
+
from kreuzberg.extraction import extract_file_sync
|
759
|
+
|
760
|
+
config = ExtractionConfig(
|
761
|
+
extract_tables=True,
|
762
|
+
gmft_config=GMFTConfig(
|
763
|
+
detector_base_threshold=0.85,
|
764
|
+
remove_null_rows=True,
|
765
|
+
enable_multi_header=True,
|
766
|
+
),
|
767
|
+
)
|
768
|
+
|
769
|
+
try:
|
770
|
+
result = extract_file_sync(pdf_path, config=config)
|
771
|
+
|
772
|
+
assert result.content
|
773
|
+
assert "Sample PDF" in result.content
|
774
|
+
|
775
|
+
assert hasattr(result, "tables")
|
776
|
+
assert isinstance(result.tables, list)
|
777
|
+
|
778
|
+
for table in result.tables:
|
779
|
+
assert "page_number" in table
|
780
|
+
assert "text" in table
|
781
|
+
assert "df" in table
|
782
|
+
assert "cropped_image" in table
|
783
|
+
|
784
|
+
import pandas as pd
|
785
|
+
|
786
|
+
assert isinstance(table["df"], pd.DataFrame)
|
787
|
+
except MissingDependencyError:
|
788
|
+
pytest.skip("GMFT dependency not installed")
|
789
|
+
|
790
|
+
|
672
791
|
class TestGMFTConfigSerialization:
|
673
792
|
"""Test GMFTConfig serialization for multiprocessing."""
|
674
793
|
|