kreuzberg 3.11.0__tar.gz → 3.11.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/workflows/ci.yaml +3 -3
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/workflows/docs.yml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/workflows/pr-title.yaml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/workflows/publish-docker.yml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/workflows/release.yaml +1 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.pre-commit-config.yaml +9 -7
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/PKG-INFO +8 -8
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/examples/extraction-examples.md +4 -4
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/extraction-configuration.md +3 -3
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_ocr/_easyocr.py +8 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_ocr/_paddleocr.py +2 -1
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/pyproject.toml +8 -8
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/uv.lock +963 -929
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.commitlintrc +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.deepsource.toml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.docker/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.dockerignore +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.gitignore +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/LICENSE +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/cli.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/contributing.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/config_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/conftest.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/gmft_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/types_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.11.0 → kreuzberg-3.11.1}/tests/utils_errors_test.py +0 -0
@@ -15,7 +15,7 @@ jobs:
|
|
15
15
|
timeout-minutes: 10
|
16
16
|
steps:
|
17
17
|
- name: Checkout
|
18
|
-
uses: actions/checkout@
|
18
|
+
uses: actions/checkout@v5
|
19
19
|
|
20
20
|
- name: Install uv
|
21
21
|
uses: astral-sh/setup-uv@v6
|
@@ -58,7 +58,7 @@ jobs:
|
|
58
58
|
timeout-minutes: 20
|
59
59
|
steps:
|
60
60
|
- name: Checkout
|
61
|
-
uses: actions/checkout@
|
61
|
+
uses: actions/checkout@v5
|
62
62
|
|
63
63
|
- name: Install uv
|
64
64
|
uses: astral-sh/setup-uv@v6
|
@@ -151,7 +151,7 @@ jobs:
|
|
151
151
|
timeout-minutes: 30
|
152
152
|
steps:
|
153
153
|
- name: Checkout
|
154
|
-
uses: actions/checkout@
|
154
|
+
uses: actions/checkout@v5
|
155
155
|
|
156
156
|
- name: Install uv
|
157
157
|
uses: astral-sh/setup-uv@v6
|
@@ -5,13 +5,15 @@ repos:
|
|
5
5
|
- id: commitlint
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
-
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
# Temporarily disabled - ai-rulez Go build failing in CI
|
9
|
+
# TODO: Re-enable once ai-rulez v1.4.4+ Python migration is stable
|
10
|
+
# - repo: https://github.com/Goldziher/ai-rulez
|
11
|
+
# rev: v1.4.3
|
12
|
+
# hooks:
|
13
|
+
# - id: ai-rulez-validate
|
14
|
+
# - id: ai-rulez-generate
|
13
15
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
14
|
-
rev:
|
16
|
+
rev: v6.0.0
|
15
17
|
hooks:
|
16
18
|
- id: name-tests-test
|
17
19
|
args:
|
@@ -53,7 +55,7 @@ repos:
|
|
53
55
|
hooks:
|
54
56
|
- id: pyproject-fmt
|
55
57
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
58
|
+
rev: v0.12.8
|
57
59
|
hooks:
|
58
60
|
- id: ruff
|
59
61
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.
|
3
|
+
Version: 3.11.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.12.
|
35
|
+
Requires-Dist: mcp>=1.12.4
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: playa-pdf>=0.
|
37
|
+
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
39
39
|
Requires-Dist: pypdfium2==4.30.0
|
40
40
|
Requires-Dist: python-calamine>=0.3.2
|
@@ -50,19 +50,19 @@ Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
|
50
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
51
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
52
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
53
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
53
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
54
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
55
55
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
56
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
57
|
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
58
|
-
Requires-Dist: playa-pdf[crypto]>=0.
|
58
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
59
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
60
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
61
61
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
62
62
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
63
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
64
64
|
Provides-Extra: api
|
65
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
65
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
66
66
|
Provides-Extra: chunking
|
67
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
68
68
|
Provides-Extra: cli
|
@@ -70,7 +70,7 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
|
|
70
70
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
71
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
72
72
|
Provides-Extra: crypto
|
73
|
-
Requires-Dist: playa-pdf[crypto]>=0.
|
73
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
74
74
|
Provides-Extra: document-classification
|
75
75
|
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
76
|
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
@@ -132,15 +132,15 @@ async def extract_tables_from_pdf():
|
|
132
132
|
# Process extracted tables
|
133
133
|
print(f"Found {len(result.tables)} tables")
|
134
134
|
for i, table in enumerate(result.tables):
|
135
|
-
print(f"Table {i+1} on page {table
|
136
|
-
print(table
|
135
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
136
|
+
print(table["text"]) # Markdown formatted table
|
137
137
|
|
138
138
|
# Work with the pandas DataFrame
|
139
|
-
df = table
|
139
|
+
df = table["df"]
|
140
140
|
print(f"Table shape: {df.shape}")
|
141
141
|
|
142
142
|
# The cropped table image is also available
|
143
|
-
# table
|
143
|
+
# table['cropped_image'].save(f"table_{i+1}.png")
|
144
144
|
|
145
145
|
# With custom GMFT configuration
|
146
146
|
custom_config = ExtractionConfig(
|
@@ -237,10 +237,10 @@ result = await extract_file("document_with_tables.pdf", config=config)
|
|
237
237
|
|
238
238
|
# Access extracted tables
|
239
239
|
for i, table in enumerate(result.tables):
|
240
|
-
print(f"Table {i+1} on page {table
|
241
|
-
print(table
|
240
|
+
print(f"Table {i+1} on page {table['page_number']}:")
|
241
|
+
print(table["text"]) # Markdown formatted table text
|
242
242
|
# You can also access the pandas DataFrame directly
|
243
|
-
df = table
|
243
|
+
df = table["df"]
|
244
244
|
print(df.shape) # (rows, columns)
|
245
245
|
```
|
246
246
|
|
@@ -4,7 +4,6 @@ import warnings
|
|
4
4
|
from dataclasses import dataclass
|
5
5
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
6
6
|
|
7
|
-
import numpy as np
|
8
7
|
from PIL import Image
|
9
8
|
|
10
9
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
188
187
|
|
189
188
|
kwargs.pop("language", None)
|
190
189
|
kwargs.pop("use_gpu", None)
|
190
|
+
kwargs.pop("device", None)
|
191
|
+
kwargs.pop("gpu_memory_limit", None)
|
192
|
+
kwargs.pop("fallback_to_cpu", None)
|
191
193
|
|
192
194
|
try:
|
193
195
|
result = await run_sync(
|
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
455
457
|
Raises:
|
456
458
|
OCRError: If OCR processing fails.
|
457
459
|
"""
|
460
|
+
import numpy as np # noqa: PLC0415
|
461
|
+
|
458
462
|
self._init_easyocr_sync(**kwargs)
|
459
463
|
|
460
464
|
beam_width = kwargs.pop("beam_width")
|
461
465
|
kwargs.pop("language", None)
|
462
466
|
kwargs.pop("use_gpu", None)
|
467
|
+
kwargs.pop("device", None)
|
468
|
+
kwargs.pop("gpu_memory_limit", None)
|
469
|
+
kwargs.pop("fallback_to_cpu", None)
|
463
470
|
|
464
471
|
try:
|
465
472
|
result = self._reader.readtext(
|
@@ -7,7 +7,6 @@ from importlib.util import find_spec
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
9
9
|
|
10
|
-
import numpy as np
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
380
379
|
Raises:
|
381
380
|
OCRError: If OCR processing fails.
|
382
381
|
"""
|
382
|
+
import numpy as np # noqa: PLC0415
|
383
|
+
|
383
384
|
self._init_paddle_ocr_sync(**kwargs)
|
384
385
|
|
385
386
|
if image.mode != "RGB":
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.11.
|
8
|
+
version = "3.11.1"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -57,13 +57,13 @@ classifiers = [
|
|
57
57
|
]
|
58
58
|
|
59
59
|
dependencies = [
|
60
|
-
"anyio>=4.
|
60
|
+
"anyio>=4.10.0",
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
63
|
"html-to-markdown[lxml]>=1.9.0",
|
64
|
-
"mcp>=1.12.
|
64
|
+
"mcp>=1.12.4",
|
65
65
|
"msgspec>=0.18.0",
|
66
|
-
"playa-pdf>=0.
|
66
|
+
"playa-pdf>=0.7.0", # pinned due to breaking changes in 0.5.0
|
67
67
|
"psutil>=7.0.0",
|
68
68
|
"pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
|
69
69
|
"python-calamine>=0.3.2",
|
@@ -79,7 +79,7 @@ optional-dependencies.all = [
|
|
79
79
|
"kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
80
80
|
]
|
81
81
|
optional-dependencies.api = [
|
82
|
-
"litestar[standard,structlog,opentelemetry]>=2.
|
82
|
+
"litestar[standard,structlog,opentelemetry]>=2.17.0",
|
83
83
|
]
|
84
84
|
optional-dependencies.chunking = [ "semantic-text-splitter>=0.27.0" ]
|
85
85
|
optional-dependencies.cli = [
|
@@ -87,7 +87,7 @@ optional-dependencies.cli = [
|
|
87
87
|
"rich>=14.1.0",
|
88
88
|
"tomli>=2.0.0; python_version<'3.11'",
|
89
89
|
]
|
90
|
-
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.
|
90
|
+
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.7.0" ]
|
91
91
|
optional-dependencies.document-classification = [
|
92
92
|
"deep-translator>=1.11.4",
|
93
93
|
"pandas>=2.3.1",
|
@@ -111,13 +111,13 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
|
|
111
111
|
dev = [
|
112
112
|
"covdefaults>=2.3.0",
|
113
113
|
"mypy>=1.16.1",
|
114
|
-
"pre-commit>=4.
|
114
|
+
"pre-commit>=4.3.0",
|
115
115
|
"pytest>=8.4.1",
|
116
116
|
"pytest-cov>=6.2.1",
|
117
117
|
"pytest-mock>=3.14.0",
|
118
118
|
"pytest-rerunfailures>=15.1",
|
119
119
|
"pytest-timeout>=2.4.0",
|
120
|
-
"ruff>=0.12.
|
120
|
+
"ruff>=0.12.8",
|
121
121
|
"trio>=0.30.0",
|
122
122
|
"uv-bump",
|
123
123
|
]
|