kreuzberg 3.11.1__tar.gz → 3.11.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.pre-commit-config.yaml +7 -9
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/PKG-INFO +4 -4
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_gmft.py +28 -10
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/pyproject.toml +4 -4
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/gmft_test.py +119 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/uv.lock +110 -93
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.commitlintrc +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.deepsource.toml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.docker/Dockerfile +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.docker/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.dockerignore +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.gitignore +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/LICENSE +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/ai-rulez.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/cli.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/contributing.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/mkdocs.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/api/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/api/main_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/cli_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/config_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/conftest.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extraction_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/types_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.2}/tests/utils_errors_test.py +0 -0
@@ -5,13 +5,11 @@ repos:
|
|
5
5
|
- id: commitlint
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
# - id: ai-rulez-validate
|
14
|
-
# - id: ai-rulez-generate
|
8
|
+
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
+
rev: v1.5.1
|
10
|
+
hooks:
|
11
|
+
- id: ai-rulez-validate
|
12
|
+
- id: ai-rulez-generate
|
15
13
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
16
14
|
rev: v6.0.0
|
17
15
|
hooks:
|
@@ -55,7 +53,7 @@ repos:
|
|
55
53
|
hooks:
|
56
54
|
- id: pyproject-fmt
|
57
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
58
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.9
|
59
57
|
hooks:
|
60
58
|
- id: ruff
|
61
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -68,7 +66,7 @@ repos:
|
|
68
66
|
additional_dependencies:
|
69
67
|
- tomli
|
70
68
|
- repo: https://github.com/jsh9/pydoclint
|
71
|
-
rev: 0.6.
|
69
|
+
rev: 0.6.10
|
72
70
|
hooks:
|
73
71
|
- id: pydoclint
|
74
72
|
args:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.1
|
3
|
+
Version: 3.11.2
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.10.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.
|
35
|
+
Requires-Dist: mcp>=1.13.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -52,7 +52,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'all'
|
|
52
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
53
53
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
54
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
55
|
-
Requires-Dist: paddleocr>=3.1.
|
55
|
+
Requires-Dist: paddleocr>=3.1.1; extra == 'all'
|
56
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
57
|
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
58
58
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
@@ -84,7 +84,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
|
84
84
|
Provides-Extra: langdetect
|
85
85
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
86
86
|
Provides-Extra: paddleocr
|
87
|
-
Requires-Dist: paddleocr>=3.1.
|
87
|
+
Requires-Dist: paddleocr>=3.1.1; extra == 'paddleocr'
|
88
88
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
89
89
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
90
90
|
Description-Content-Type: text/markdown
|
@@ -444,14 +444,26 @@ def _extract_tables_in_process(
|
|
444
444
|
cropped_image.save(img_bytes, format="PNG")
|
445
445
|
img_bytes.seek(0)
|
446
446
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
447
|
+
if data_frame.empty:
|
448
|
+
results.append(
|
449
|
+
{
|
450
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
451
|
+
"page_number": cropped_table.page.page_number,
|
452
|
+
"text": data_frame.to_markdown(),
|
453
|
+
"df_columns": data_frame.columns.tolist(),
|
454
|
+
"df_csv": None,
|
455
|
+
}
|
456
|
+
)
|
457
|
+
else:
|
458
|
+
results.append(
|
459
|
+
{
|
460
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
461
|
+
"page_number": cropped_table.page.page_number,
|
462
|
+
"text": data_frame.to_markdown(),
|
463
|
+
"df_columns": None,
|
464
|
+
"df_csv": data_frame.to_csv(index=False),
|
465
|
+
}
|
466
|
+
)
|
455
467
|
|
456
468
|
result_queue.put((True, results))
|
457
469
|
|
@@ -532,7 +544,10 @@ def _extract_tables_isolated(
|
|
532
544
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
533
545
|
import pandas as pd # noqa: PLC0415
|
534
546
|
|
535
|
-
|
547
|
+
if table_dict["df_csv"] is None:
|
548
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
549
|
+
else:
|
550
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
536
551
|
|
537
552
|
tables.append(
|
538
553
|
TableData(
|
@@ -638,7 +653,10 @@ async def _extract_tables_isolated_async(
|
|
638
653
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
639
654
|
import pandas as pd # noqa: PLC0415
|
640
655
|
|
641
|
-
|
656
|
+
if table_dict["df_csv"] is None:
|
657
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
658
|
+
else:
|
659
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
642
660
|
|
643
661
|
tables.append(
|
644
662
|
TableData(
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.11.1"
|
8
|
+
version = "3.11.2"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -61,7 +61,7 @@ dependencies = [
|
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
63
|
"html-to-markdown[lxml]>=1.9.0",
|
64
|
-
"mcp>=1.
|
64
|
+
"mcp>=1.13.0",
|
65
65
|
"msgspec>=0.18.0",
|
66
66
|
"playa-pdf>=0.7.0", # pinned due to breaking changes in 0.5.0
|
67
67
|
"psutil>=7.0.0",
|
@@ -97,7 +97,7 @@ optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
|
|
97
97
|
optional-dependencies.gmft = [ "gmft>=0.4.2" ]
|
98
98
|
optional-dependencies.langdetect = [ "fast-langdetect>=0.3.2" ]
|
99
99
|
optional-dependencies.paddleocr = [
|
100
|
-
"paddleocr>=3.1.
|
100
|
+
"paddleocr>=3.1.1",
|
101
101
|
"paddlepaddle>=3.1.0",
|
102
102
|
"setuptools>=80.9.0",
|
103
103
|
]
|
@@ -117,7 +117,7 @@ dev = [
|
|
117
117
|
"pytest-mock>=3.14.0",
|
118
118
|
"pytest-rerunfailures>=15.1",
|
119
119
|
"pytest-timeout>=2.4.0",
|
120
|
-
"ruff>=0.12.
|
120
|
+
"ruff>=0.12.9",
|
121
121
|
"trio>=0.30.0",
|
122
122
|
"uv-bump",
|
123
123
|
]
|
@@ -669,6 +669,125 @@ class TestGMFTInlineExtractionEdgeCases:
|
|
669
669
|
pytest.skip("GMFT dependency not available for inline testing")
|
670
670
|
|
671
671
|
|
672
|
+
class TestGMFTWithoutTables:
|
673
|
+
"""Test GMFT behavior with PDFs that have no tables - issue #104."""
|
674
|
+
|
675
|
+
@pytest.mark.anyio
|
676
|
+
async def test_extract_tables_pdf_without_tables_async(self) -> None:
|
677
|
+
"""Test that extract_tables handles PDFs without tables gracefully (async)."""
|
678
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
679
|
+
|
680
|
+
try:
|
681
|
+
tables = await extract_tables(pdf_path)
|
682
|
+
|
683
|
+
assert isinstance(tables, list)
|
684
|
+
|
685
|
+
for table in tables:
|
686
|
+
assert "page_number" in table
|
687
|
+
assert "text" in table
|
688
|
+
assert "df" in table
|
689
|
+
assert "cropped_image" in table
|
690
|
+
|
691
|
+
import pandas as pd
|
692
|
+
|
693
|
+
assert isinstance(table["df"], pd.DataFrame)
|
694
|
+
except MissingDependencyError:
|
695
|
+
pytest.skip("GMFT dependency not installed")
|
696
|
+
|
697
|
+
def test_extract_tables_pdf_without_tables_sync(self) -> None:
|
698
|
+
"""Test that extract_tables_sync handles PDFs without tables gracefully (sync)."""
|
699
|
+
# Using searchable.pdf which is a simple text PDF without tables
|
700
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
701
|
+
|
702
|
+
try:
|
703
|
+
tables = extract_tables_sync(pdf_path)
|
704
|
+
|
705
|
+
assert isinstance(tables, list)
|
706
|
+
|
707
|
+
for table in tables:
|
708
|
+
assert "page_number" in table
|
709
|
+
assert "text" in table
|
710
|
+
assert "df" in table
|
711
|
+
assert "cropped_image" in table
|
712
|
+
|
713
|
+
import pandas as pd
|
714
|
+
|
715
|
+
assert isinstance(table["df"], pd.DataFrame)
|
716
|
+
except MissingDependencyError:
|
717
|
+
pytest.skip("GMFT dependency not installed")
|
718
|
+
|
719
|
+
@pytest.mark.anyio
|
720
|
+
async def test_extract_file_with_gmft_pdf_without_tables(self) -> None:
|
721
|
+
"""Test that extract_file with extract_tables=True handles PDFs without tables gracefully."""
|
722
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
723
|
+
|
724
|
+
config = ExtractionConfig(
|
725
|
+
extract_tables=True,
|
726
|
+
gmft_config=GMFTConfig(
|
727
|
+
detector_base_threshold=0.85,
|
728
|
+
remove_null_rows=True,
|
729
|
+
enable_multi_header=True,
|
730
|
+
),
|
731
|
+
)
|
732
|
+
|
733
|
+
try:
|
734
|
+
result = await extract_file(pdf_path, config=config)
|
735
|
+
|
736
|
+
assert result.content
|
737
|
+
assert "Sample PDF" in result.content
|
738
|
+
|
739
|
+
assert hasattr(result, "tables")
|
740
|
+
assert isinstance(result.tables, list)
|
741
|
+
|
742
|
+
for table in result.tables:
|
743
|
+
assert "page_number" in table
|
744
|
+
assert "text" in table
|
745
|
+
assert "df" in table
|
746
|
+
assert "cropped_image" in table
|
747
|
+
|
748
|
+
import pandas as pd
|
749
|
+
|
750
|
+
assert isinstance(table["df"], pd.DataFrame)
|
751
|
+
except MissingDependencyError:
|
752
|
+
pytest.skip("GMFT dependency not installed")
|
753
|
+
|
754
|
+
def test_extract_file_sync_with_gmft_pdf_without_tables(self) -> None:
|
755
|
+
"""Test that extract_file_sync with extract_tables=True handles PDFs without tables gracefully."""
|
756
|
+
pdf_path = Path("tests/test_source_files/searchable.pdf")
|
757
|
+
|
758
|
+
from kreuzberg.extraction import extract_file_sync
|
759
|
+
|
760
|
+
config = ExtractionConfig(
|
761
|
+
extract_tables=True,
|
762
|
+
gmft_config=GMFTConfig(
|
763
|
+
detector_base_threshold=0.85,
|
764
|
+
remove_null_rows=True,
|
765
|
+
enable_multi_header=True,
|
766
|
+
),
|
767
|
+
)
|
768
|
+
|
769
|
+
try:
|
770
|
+
result = extract_file_sync(pdf_path, config=config)
|
771
|
+
|
772
|
+
assert result.content
|
773
|
+
assert "Sample PDF" in result.content
|
774
|
+
|
775
|
+
assert hasattr(result, "tables")
|
776
|
+
assert isinstance(result.tables, list)
|
777
|
+
|
778
|
+
for table in result.tables:
|
779
|
+
assert "page_number" in table
|
780
|
+
assert "text" in table
|
781
|
+
assert "df" in table
|
782
|
+
assert "cropped_image" in table
|
783
|
+
|
784
|
+
import pandas as pd
|
785
|
+
|
786
|
+
assert isinstance(table["df"], pd.DataFrame)
|
787
|
+
except MissingDependencyError:
|
788
|
+
pytest.skip("GMFT dependency not installed")
|
789
|
+
|
790
|
+
|
672
791
|
class TestGMFTConfigSerialization:
|
673
792
|
"""Test GMFTConfig serialization for multiprocessing."""
|
674
793
|
|