kreuzberg 3.11.3__tar.gz → 3.11.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.docker/Dockerfile +5 -1
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/docker-e2e-tests.yml +1 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.gitignore +3 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/PKG-INFO +1 -1
- kreuzberg-3.11.4/docker-compose.example.yml +37 -0
- kreuzberg-3.11.4/docs/user-guide/docker.md +417 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/cli.py +1 -3
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/pyproject.toml +2 -4
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/e2e/docker_images_test.py +6 -7
- kreuzberg-3.11.4/tests/test_source_files/contract.txt +1 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/uv.lock +2 -2
- kreuzberg-3.11.3/.task/checksum/docker-build-core +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-easyocr +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-gmft +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-paddle +0 -1
- kreuzberg-3.11.3/docs/user-guide/docker.md +0 -389
- kreuzberg-3.11.3/tests/e2e/run_docker_tests.sh +0 -371
- kreuzberg-3.11.3/tests/e2e/test_report.json +0 -14
- kreuzberg-3.11.3/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.commitlintrc +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.deepsource.toml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.docker/README.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.dockerignore +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/LICENSE +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/README.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/Taskfile.yml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/ai-rulez.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/README.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/cli.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/contributing.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/mkdocs.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/api/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/api/main_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/cli_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/config_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/conftest.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extraction_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/gmft_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/types_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.11.4}/tests/utils_errors_test.py +0 -0
@@ -46,9 +46,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
46
46
|
COPY --from=builder /app/.venv /app/.venv
|
47
47
|
COPY --from=builder /app/kreuzberg /app/kreuzberg
|
48
48
|
|
49
|
-
# Create non-root user
|
49
|
+
# Create non-root user and cache directory
|
50
50
|
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser && \
|
51
|
+
mkdir -p /app/.kreuzberg && \
|
51
52
|
chown -R appuser:appuser /app
|
52
53
|
|
54
|
+
# Set default cache directory to prevent permission issues
|
55
|
+
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
56
|
+
|
53
57
|
USER appuser
|
54
58
|
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.3
|
3
|
+
Version: 3.11.4
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -0,0 +1,37 @@
|
|
1
|
+
services:
|
2
|
+
kreuzberg:
|
3
|
+
# Choose your image variant:
|
4
|
+
# - goldziher/kreuzberg:latest (core, 270MB)
|
5
|
+
# - goldziher/kreuzberg:latest-paddle (+ PaddleOCR, 878MB)
|
6
|
+
# - goldziher/kreuzberg:latest-easyocr (+ EasyOCR, 8.7GB)
|
7
|
+
# - goldziher/kreuzberg:latest-gmft (+ table extraction, 8.6GB)
|
8
|
+
# - goldziher/kreuzberg:latest-all (all features, 9.6GB - testing only!)
|
9
|
+
image: goldziher/kreuzberg:latest
|
10
|
+
|
11
|
+
ports:
|
12
|
+
- "8000:8000"
|
13
|
+
|
14
|
+
volumes:
|
15
|
+
# Mount your configuration file (optional)
|
16
|
+
- "./kreuzberg.toml:/app/kreuzberg.toml:ro"
|
17
|
+
# Persist cache across restarts (recommended)
|
18
|
+
- "kreuzberg-cache:/app/.kreuzberg"
|
19
|
+
|
20
|
+
environment:
|
21
|
+
- PYTHONUNBUFFERED=1
|
22
|
+
- KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
23
|
+
# Optional: Cache size limits
|
24
|
+
# - KREUZBERG_OCR_CACHE_SIZE_MB=500
|
25
|
+
# - KREUZBERG_DOCUMENT_CACHE_SIZE_MB=1000
|
26
|
+
|
27
|
+
restart: unless-stopped
|
28
|
+
|
29
|
+
healthcheck:
|
30
|
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
31
|
+
interval: 30s
|
32
|
+
timeout: 10s
|
33
|
+
retries: 3
|
34
|
+
start_period: 40s
|
35
|
+
|
36
|
+
volumes:
|
37
|
+
kreuzberg-cache:
|
@@ -0,0 +1,417 @@
|
|
1
|
+
# Docker
|
2
|
+
|
3
|
+
Kreuzberg provides official Docker images for easy deployment and containerized usage.
|
4
|
+
|
5
|
+
## Image Variants
|
6
|
+
|
7
|
+
Docker images are available on [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg):
|
8
|
+
|
9
|
+
### Core Image
|
10
|
+
|
11
|
+
- **Image**: `goldziher/kreuzberg:latest`
|
12
|
+
- **Size**: ~270MB
|
13
|
+
- **Includes**: Base library + API server + Tesseract OCR
|
14
|
+
- **Use Case**: Basic text extraction from documents
|
15
|
+
- **Limitations**: No chunking, language detection, entity extraction, or alternative OCR backends
|
16
|
+
|
17
|
+
### OCR Backend Variants
|
18
|
+
|
19
|
+
- **EasyOCR**: `goldziher/kreuzberg:latest-easyocr` (~8.7GB)
|
20
|
+
|
21
|
+
- Deep learning-based OCR with support for 80+ languages
|
22
|
+
- Better accuracy for complex layouts and handwriting
|
23
|
+
|
24
|
+
- **PaddleOCR**: `goldziher/kreuzberg:latest-paddle` (~878MB)
|
25
|
+
|
26
|
+
- Lightweight deep learning OCR
|
27
|
+
- Good balance between size and accuracy
|
28
|
+
|
29
|
+
### Table Extraction
|
30
|
+
|
31
|
+
- **GMFT**: `goldziher/kreuzberg:latest-gmft` (~8.6GB)
|
32
|
+
- Advanced table detection and extraction from PDFs
|
33
|
+
- Uses Microsoft's Table Transformer models
|
34
|
+
|
35
|
+
### All-in-One (Testing Only)
|
36
|
+
|
37
|
+
- **Image**: `goldziher/kreuzberg:latest-all`
|
38
|
+
- **Size**: ~9.6GB
|
39
|
+
- **⚠️ WARNING**: For testing only, NOT for production use
|
40
|
+
- **Includes**: All OCR backends and features
|
41
|
+
- **Why not production?**: Unnecessarily large, includes conflicting dependencies, slower startup
|
42
|
+
|
43
|
+
## Quick Start
|
44
|
+
|
45
|
+
### Basic Usage
|
46
|
+
|
47
|
+
```bash
|
48
|
+
# Pull and run the core image
|
49
|
+
docker pull goldziher/kreuzberg:latest
|
50
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
51
|
+
|
52
|
+
# Extract text from a document
|
53
|
+
curl -X POST http://localhost:8000/extract \
|
54
|
+
-F "data=@document.pdf"
|
55
|
+
```
|
56
|
+
|
57
|
+
### With Cache Volume
|
58
|
+
|
59
|
+
```bash
|
60
|
+
# Create cache directory
|
61
|
+
mkdir -p kreuzberg-cache
|
62
|
+
|
63
|
+
# Run with persistent cache
|
64
|
+
docker run -p 8000:8000 \
|
65
|
+
-v "$(pwd)/kreuzberg-cache:/app/.kreuzberg" \
|
66
|
+
goldziher/kreuzberg:latest
|
67
|
+
```
|
68
|
+
|
69
|
+
## Customizing Docker Images
|
70
|
+
|
71
|
+
For production, create a custom image with only the features you need:
|
72
|
+
|
73
|
+
### Example 1: Core + Chunking Support
|
74
|
+
|
75
|
+
```dockerfile
|
76
|
+
FROM goldziher/kreuzberg:latest
|
77
|
+
|
78
|
+
USER root
|
79
|
+
|
80
|
+
# Install only chunking dependency
|
81
|
+
RUN uv pip install --python /app/.venv/bin/python semantic-text-splitter
|
82
|
+
|
83
|
+
USER appuser
|
84
|
+
```
|
85
|
+
|
86
|
+
Build and run:
|
87
|
+
|
88
|
+
```bash
|
89
|
+
# Build the image
|
90
|
+
docker build -t kreuzberg-chunking .
|
91
|
+
|
92
|
+
# Run with external configuration
|
93
|
+
docker run -p 8000:8000 \
|
94
|
+
-v "$(pwd)/kreuzberg.toml:/app/kreuzberg.toml:ro" \
|
95
|
+
-v "$(pwd)/cache:/app/.kreuzberg" \
|
96
|
+
kreuzberg-chunking
|
97
|
+
```
|
98
|
+
|
99
|
+
### Example 2: Core + Language Detection + Chunking
|
100
|
+
|
101
|
+
```dockerfile
|
102
|
+
FROM goldziher/kreuzberg:latest
|
103
|
+
|
104
|
+
USER root
|
105
|
+
|
106
|
+
# Install specific features
|
107
|
+
RUN uv pip install --python /app/.venv/bin/python \
|
108
|
+
semantic-text-splitter \
|
109
|
+
fast-langdetect
|
110
|
+
|
111
|
+
USER appuser
|
112
|
+
```
|
113
|
+
|
114
|
+
Create configuration file `kreuzberg.toml`:
|
115
|
+
|
116
|
+
```toml
|
117
|
+
chunk_content = true
|
118
|
+
auto_detect_language = true
|
119
|
+
max_chars = 2000
|
120
|
+
max_overlap = 100
|
121
|
+
```
|
122
|
+
|
123
|
+
Run with:
|
124
|
+
|
125
|
+
```bash
|
126
|
+
docker run -p 8000:8000 \
|
127
|
+
-v "$(pwd)/kreuzberg.toml:/app/kreuzberg.toml:ro" \
|
128
|
+
-v "$(pwd)/cache:/app/.kreuzberg" \
|
129
|
+
kreuzberg-multilang
|
130
|
+
```
|
131
|
+
|
132
|
+
### Example 3: Core + PaddleOCR (Custom Build)
|
133
|
+
|
134
|
+
```dockerfile
|
135
|
+
FROM goldziher/kreuzberg:latest
|
136
|
+
|
137
|
+
USER root
|
138
|
+
|
139
|
+
# Install PaddleOCR dependencies
|
140
|
+
RUN uv pip install --python /app/.venv/bin/python \
|
141
|
+
paddleocr \
|
142
|
+
paddlepaddle
|
143
|
+
|
144
|
+
USER appuser
|
145
|
+
```
|
146
|
+
|
147
|
+
Run with PaddleOCR backend:
|
148
|
+
|
149
|
+
```bash
|
150
|
+
docker run -p 8000:8000 \
|
151
|
+
-e KREUZBERG_OCR_BACKEND=paddleocr \
|
152
|
+
-v "$(pwd)/cache:/app/.kreuzberg" \
|
153
|
+
kreuzberg-paddle
|
154
|
+
```
|
155
|
+
|
156
|
+
### Example 4: Optimized Production Build
|
157
|
+
|
158
|
+
```dockerfile
|
159
|
+
FROM goldziher/kreuzberg:latest
|
160
|
+
|
161
|
+
USER root
|
162
|
+
|
163
|
+
# Install only the features you need
|
164
|
+
RUN uv pip install --python /app/.venv/bin/python \
|
165
|
+
semantic-text-splitter \
|
166
|
+
fast-langdetect && \
|
167
|
+
# Clean up cache to reduce image size
|
168
|
+
rm -rf /root/.cache/uv
|
169
|
+
|
170
|
+
USER appuser
|
171
|
+
|
172
|
+
# Set production environment variables
|
173
|
+
ENV PYTHONUNBUFFERED=1
|
174
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
175
|
+
```
|
176
|
+
|
177
|
+
Deploy with Docker Compose:
|
178
|
+
|
179
|
+
```yaml
|
180
|
+
services:
|
181
|
+
kreuzberg:
|
182
|
+
build: .
|
183
|
+
ports:
|
184
|
+
- "8000:8000"
|
185
|
+
volumes:
|
186
|
+
- "./config/kreuzberg.toml:/app/kreuzberg.toml:ro"
|
187
|
+
- "kreuzberg-cache:/app/.kreuzberg"
|
188
|
+
environment:
|
189
|
+
- PYTHONUNBUFFERED=1
|
190
|
+
- KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
191
|
+
restart: unless-stopped
|
192
|
+
healthcheck:
|
193
|
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
194
|
+
interval: 30s
|
195
|
+
timeout: 10s
|
196
|
+
retries: 3
|
197
|
+
|
198
|
+
volumes:
|
199
|
+
kreuzberg-cache:
|
200
|
+
```
|
201
|
+
|
202
|
+
## Docker Compose
|
203
|
+
|
204
|
+
### Production Setup
|
205
|
+
|
206
|
+
```yaml
|
207
|
+
services:
|
208
|
+
kreuzberg:
|
209
|
+
image: goldziher/kreuzberg:latest # Or your custom image
|
210
|
+
ports:
|
211
|
+
- "8000:8000"
|
212
|
+
volumes:
|
213
|
+
- "./kreuzberg-cache:/app/.kreuzberg" # Persistent cache
|
214
|
+
- "./kreuzberg.toml:/app/kreuzberg.toml:ro" # Configuration
|
215
|
+
environment:
|
216
|
+
- PYTHONUNBUFFERED=1
|
217
|
+
- KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
218
|
+
# Cache configuration
|
219
|
+
- KREUZBERG_OCR_CACHE_SIZE_MB=500
|
220
|
+
- KREUZBERG_DOCUMENT_CACHE_SIZE_MB=1000
|
221
|
+
restart: unless-stopped
|
222
|
+
healthcheck:
|
223
|
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
224
|
+
interval: 30s
|
225
|
+
timeout: 10s
|
226
|
+
retries: 3
|
227
|
+
```
|
228
|
+
|
229
|
+
## Configuration
|
230
|
+
|
231
|
+
### Using Configuration Files
|
232
|
+
|
233
|
+
Create `kreuzberg.toml`:
|
234
|
+
|
235
|
+
```toml
|
236
|
+
force_ocr = false
|
237
|
+
chunk_content = true # Requires semantic-text-splitter
|
238
|
+
extract_tables = false # Requires gmft
|
239
|
+
ocr_backend = "tesseract"
|
240
|
+
|
241
|
+
[tesseract]
|
242
|
+
language = "eng"
|
243
|
+
psm = 6
|
244
|
+
```
|
245
|
+
|
246
|
+
Mount the configuration:
|
247
|
+
|
248
|
+
```bash
|
249
|
+
docker run -p 8000:8000 \
|
250
|
+
-v "$(pwd)/kreuzberg.toml:/app/kreuzberg.toml" \
|
251
|
+
goldziher/kreuzberg:latest
|
252
|
+
```
|
253
|
+
|
254
|
+
### Environment Variables
|
255
|
+
|
256
|
+
**Cache Configuration:**
|
257
|
+
|
258
|
+
- `KREUZBERG_CACHE_DIR`: Cache directory (default: `/app/.kreuzberg`)
|
259
|
+
- `KREUZBERG_OCR_CACHE_SIZE_MB`: OCR cache size limit (default: `500`)
|
260
|
+
- `KREUZBERG_DOCUMENT_CACHE_SIZE_MB`: Document cache size limit (default: `1000`)
|
261
|
+
|
262
|
+
**Runtime Configuration:**
|
263
|
+
|
264
|
+
- `PYTHONUNBUFFERED=1`: Ensures proper logging output
|
265
|
+
- `PYTHONDONTWRITEBYTECODE=1`: Prevents .pyc file creation
|
266
|
+
|
267
|
+
## Production Deployment
|
268
|
+
|
269
|
+
### Kubernetes
|
270
|
+
|
271
|
+
```yaml
|
272
|
+
apiVersion: apps/v1
|
273
|
+
kind: Deployment
|
274
|
+
metadata:
|
275
|
+
name: kreuzberg-api
|
276
|
+
spec:
|
277
|
+
replicas: 3
|
278
|
+
selector:
|
279
|
+
matchLabels:
|
280
|
+
app: kreuzberg-api
|
281
|
+
template:
|
282
|
+
metadata:
|
283
|
+
labels:
|
284
|
+
app: kreuzberg-api
|
285
|
+
spec:
|
286
|
+
containers:
|
287
|
+
- name: kreuzberg
|
288
|
+
image: your-registry/kreuzberg-custom:latest
|
289
|
+
ports:
|
290
|
+
- containerPort: 8000
|
291
|
+
volumeMounts:
|
292
|
+
- name: cache
|
293
|
+
mountPath: /app/.kreuzberg
|
294
|
+
- name: config
|
295
|
+
mountPath: /app/kreuzberg.toml
|
296
|
+
subPath: kreuzberg.toml
|
297
|
+
livenessProbe:
|
298
|
+
httpGet:
|
299
|
+
path: /health
|
300
|
+
port: 8000
|
301
|
+
initialDelaySeconds: 30
|
302
|
+
readinessProbe:
|
303
|
+
httpGet:
|
304
|
+
path: /health
|
305
|
+
port: 8000
|
306
|
+
initialDelaySeconds: 5
|
307
|
+
resources:
|
308
|
+
requests:
|
309
|
+
memory: "512Mi"
|
310
|
+
cpu: "500m"
|
311
|
+
limits:
|
312
|
+
memory: "2Gi"
|
313
|
+
cpu: "2000m"
|
314
|
+
volumes:
|
315
|
+
- name: cache
|
316
|
+
emptyDir: {}
|
317
|
+
- name: config
|
318
|
+
configMap:
|
319
|
+
name: kreuzberg-config
|
320
|
+
---
|
321
|
+
apiVersion: v1
|
322
|
+
kind: ConfigMap
|
323
|
+
metadata:
|
324
|
+
name: kreuzberg-config
|
325
|
+
data:
|
326
|
+
kreuzberg.toml: |
|
327
|
+
chunk_content = false
|
328
|
+
ocr_backend = "tesseract"
|
329
|
+
[tesseract]
|
330
|
+
language = "eng"
|
331
|
+
```
|
332
|
+
|
333
|
+
### With nginx Reverse Proxy
|
334
|
+
|
335
|
+
```nginx
|
336
|
+
server {
|
337
|
+
listen 80;
|
338
|
+
server_name api.example.com;
|
339
|
+
|
340
|
+
location / {
|
341
|
+
proxy_pass http://localhost:8000;
|
342
|
+
proxy_set_header Host $host;
|
343
|
+
proxy_set_header X-Real-IP $remote_addr;
|
344
|
+
|
345
|
+
# File upload settings
|
346
|
+
client_max_body_size 100M;
|
347
|
+
proxy_read_timeout 300s;
|
348
|
+
}
|
349
|
+
|
350
|
+
location /health {
|
351
|
+
proxy_pass http://localhost:8000/health;
|
352
|
+
access_log off;
|
353
|
+
}
|
354
|
+
}
|
355
|
+
```
|
356
|
+
|
357
|
+
## Resource Requirements
|
358
|
+
|
359
|
+
| Variant | CPU | Memory | Storage |
|
360
|
+
| ----------- | -------- | ------ | ------- |
|
361
|
+
| Core | 1+ cores | 512MB+ | 1GB |
|
362
|
+
| + Chunking | 1+ cores | 1GB+ | 1GB |
|
363
|
+
| + PaddleOCR | 2+ cores | 2GB+ | 2GB |
|
364
|
+
| + EasyOCR | 2+ cores | 4GB+ | 10GB |
|
365
|
+
| + GMFT | 2+ cores | 4GB+ | 10GB |
|
366
|
+
|
367
|
+
## Troubleshooting
|
368
|
+
|
369
|
+
### Common Issues
|
370
|
+
|
371
|
+
#### Permission Denied on Cache Directory
|
372
|
+
|
373
|
+
```bash
|
374
|
+
# Fix: Ensure proper ownership
|
375
|
+
docker run --rm -v "$(pwd)/cache:/app/.kreuzberg" --user root \
|
376
|
+
goldziher/kreuzberg:latest \
|
377
|
+
chown -R 999:999 /app/.kreuzberg
|
378
|
+
```
|
379
|
+
|
380
|
+
#### Missing Dependencies Error
|
381
|
+
|
382
|
+
```bash
|
383
|
+
# Solution: Use appropriate image variant or build custom image
|
384
|
+
# For chunking: Install semantic-text-splitter
|
385
|
+
# For language detection: Install fast-langdetect
|
386
|
+
```
|
387
|
+
|
388
|
+
#### Out of Memory
|
389
|
+
|
390
|
+
- Increase Docker memory allocation
|
391
|
+
- Use a smaller OCR engine (Tesseract instead of EasyOCR)
|
392
|
+
- Disable unnecessary features
|
393
|
+
|
394
|
+
### Debugging
|
395
|
+
|
396
|
+
```bash
|
397
|
+
# Check logs
|
398
|
+
docker logs <container-id>
|
399
|
+
|
400
|
+
# Shell access
|
401
|
+
docker exec -it <container-id> /bin/bash
|
402
|
+
|
403
|
+
# Test extraction
|
404
|
+
docker exec <container-id> python3 -c "
|
405
|
+
from kreuzberg import extract_file_sync
|
406
|
+
result = extract_file_sync('/path/to/file.pdf')
|
407
|
+
print(result.content[:100])
|
408
|
+
"
|
409
|
+
```
|
410
|
+
|
411
|
+
## Security Considerations
|
412
|
+
|
413
|
+
- Runs as non-root user (`appuser`) by default
|
414
|
+
- No external API calls or cloud dependencies
|
415
|
+
- Process files locally within the container
|
416
|
+
- Use read-only mounts where possible (`:ro`)
|
417
|
+
- Consider adding authentication for production use
|
@@ -19,11 +19,9 @@ except ImportError as e: # pragma: no cover
|
|
19
19
|
|
20
20
|
from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
|
21
21
|
from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
|
22
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
22
23
|
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
23
24
|
|
24
|
-
DEFAULT_MAX_CHARACTERS = 4000
|
25
|
-
DEFAULT_MAX_OVERLAP = 200
|
26
|
-
|
27
25
|
if TYPE_CHECKING:
|
28
26
|
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
29
27
|
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.11.3"
|
8
|
+
version = "3.11.4"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -202,9 +202,7 @@ lint.per-file-ignores."tests/**/*.*" = [
|
|
202
202
|
"SLF001",
|
203
203
|
]
|
204
204
|
lint.per-file-ignores."tests/e2e/*_test.py" = [
|
205
|
-
"
|
206
|
-
"PTH123", # Allow open() in E2E tests
|
207
|
-
"T201", # print statements are needed for test output
|
205
|
+
"T201", # print statements are needed for test output
|
208
206
|
]
|
209
207
|
lint.isort.known-first-party = [ "kreuzberg", "tests" ]
|
210
208
|
lint.mccabe.max-complexity = 15
|
@@ -31,10 +31,11 @@ DOCKER_IMAGES = {
|
|
31
31
|
"easyocr": "kreuzberg:easyocr",
|
32
32
|
"paddle": "kreuzberg:paddle",
|
33
33
|
"gmft": "kreuzberg:gmft",
|
34
|
+
"all": "kreuzberg:all",
|
34
35
|
}
|
35
36
|
|
36
37
|
# Images that are optional (may not be built due to space constraints)
|
37
|
-
OPTIONAL_IMAGES = {"paddle", "gmft"}
|
38
|
+
OPTIONAL_IMAGES = {"paddle", "gmft", "all"}
|
38
39
|
|
39
40
|
# Security configuration
|
40
41
|
SECURITY_CONFIG = {
|
@@ -111,7 +112,7 @@ def test_cli_help(image_name: str) -> bool:
|
|
111
112
|
"--help",
|
112
113
|
]
|
113
114
|
exit_code, stdout, stderr = run_command(cmd)
|
114
|
-
success = exit_code == 0 and "
|
115
|
+
success = exit_code == 0 and "Text extraction from documents" in stdout
|
115
116
|
if not success:
|
116
117
|
pass
|
117
118
|
return success
|
@@ -536,13 +537,11 @@ def print_summary(all_results: dict[str, dict[str, bool]]) -> bool:
|
|
536
537
|
success_rate = (total_passed / total_tests * 100) if total_tests > 0 else 0
|
537
538
|
print(f"Success rate: {success_rate:.1f}%")
|
538
539
|
|
539
|
-
|
540
|
+
# ALL tests must pass - no partial success allowed
|
541
|
+
if success_rate == 100:
|
540
542
|
print("✅ Test suite PASSED")
|
541
543
|
return True
|
542
|
-
|
543
|
-
print("⚠️ Test suite PASSED with warnings")
|
544
|
-
return True
|
545
|
-
print("❌ Test suite FAILED")
|
544
|
+
print("❌ Test suite FAILED - all tests must pass")
|
546
545
|
return False
|
547
546
|
|
548
547
|
|
@@ -0,0 +1 @@
|
|
1
|
+
Test content for contract.txt file
|
@@ -956,7 +956,7 @@ name = "exceptiongroup"
|
|
956
956
|
version = "1.3.0"
|
957
957
|
source = { registry = "https://pypi.org/simple" }
|
958
958
|
dependencies = [
|
959
|
-
{ name = "typing-extensions", marker = "python_full_version < '3.
|
959
|
+
{ name = "typing-extensions", marker = "python_full_version < '3.11'" },
|
960
960
|
]
|
961
961
|
sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
|
962
962
|
wheels = [
|
@@ -1632,7 +1632,7 @@ wheels = [
|
|
1632
1632
|
|
1633
1633
|
[[package]]
|
1634
1634
|
name = "kreuzberg"
|
1635
|
-
version = "3.11.3"
|
1635
|
+
version = "3.11.4"
|
1636
1636
|
source = { editable = "." }
|
1637
1637
|
dependencies = [
|
1638
1638
|
{ name = "anyio" },
|
@@ -1 +0,0 @@
|
|
1
|
-
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -1 +0,0 @@
|
|
1
|
-
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -1 +0,0 @@
|
|
1
|
-
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -1 +0,0 @@
|
|
1
|
-
f2683e2fcbf67fa54bee4baa3ead293a
|