kreuzberg 3.14.0__tar.gz → 3.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/docker-e2e-tests.yml +1 -1
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.gitignore +27 -24
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/PKG-INFO +2 -1
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/README.md +1 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/Taskfile.yml +2 -2
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/ai-rulez.yaml +1 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/README.md +30 -0
- kreuzberg-3.15.0/benchmarks/batch_size_benchmark.py +179 -0
- kreuzberg-3.15.0/benchmarks/batch_validation_benchmark.py +83 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/pyproject.toml +2 -7
- kreuzberg-3.15.0/benchmarks/src/__main__.py +4 -0
- kreuzberg-3.15.0/benchmarks/src/benchmarks.py +703 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/src/cli.py +215 -182
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/src/models.py +10 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/src/profiler.py +12 -21
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/src/runner.py +52 -63
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/types.md +20 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/examples/extraction-examples.md +265 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/index.md +2 -1
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/api-server.md +128 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/extraction-configuration.md +197 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/supported-formats.md +10 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/__init__.py +6 -0
- kreuzberg-3.15.0/kreuzberg/_api/_config_cache.py +247 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_api/main.py +156 -30
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_chunker.py +7 -6
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_constants.py +2 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_document_classification.py +4 -6
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_entity_extraction.py +9 -4
- kreuzberg-3.15.0/kreuzberg/_extractors/_base.py +328 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_email.py +95 -27
- kreuzberg-3.15.0/kreuzberg/_extractors/_html.py +121 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_image.py +23 -22
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_pandoc.py +106 -75
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_pdf.py +209 -99
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_presentation.py +72 -8
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_spread_sheet.py +25 -30
- kreuzberg-3.15.0/kreuzberg/_mcp/server.py +514 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_mime_types.py +42 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/_easyocr.py +2 -2
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/_paddleocr.py +1 -1
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/_tesseract.py +74 -34
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_types.py +182 -23
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_cache.py +10 -4
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_device.py +2 -4
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_image_preprocessing.py +12 -39
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_process_pool.py +29 -8
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_quality.py +7 -2
- kreuzberg-3.15.0/kreuzberg/_utils/_resource_managers.py +65 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_sync.py +36 -6
- kreuzberg-3.15.0/kreuzberg/_utils/_tmp.py +64 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/cli.py +34 -20
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/extraction.py +43 -27
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/pyproject.toml +5 -21
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/api/conftest.py +1 -0
- kreuzberg-3.15.0/tests/api/header_config_hashing_test.py +29 -0
- kreuzberg-3.15.0/tests/api/image_extraction_test.py +56 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/api/main_test.py +4 -2
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/api/runtime_config_test.py +49 -0
- kreuzberg-3.15.0/tests/core/config_test.py +15 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/dpi_configuration_test.py +25 -78
- kreuzberg-3.15.0/tests/core/extraction_batch_test.py +446 -0
- kreuzberg-3.15.0/tests/core/extraction_test.py +457 -0
- kreuzberg-3.15.0/tests/core/image_ocr_result_test.py +27 -0
- kreuzberg-3.15.0/tests/core/types_test.py +23 -0
- kreuzberg-3.15.0/tests/extractors/README_image_tests.md +85 -0
- kreuzberg-3.15.0/tests/extractors/base_memory_limits_test.py +100 -0
- kreuzberg-3.15.0/tests/extractors/base_ocr_processing_test.py +288 -0
- kreuzberg-3.15.0/tests/extractors/base_ocr_simple_test.py +64 -0
- kreuzberg-3.15.0/tests/extractors/email_error_paths_test.py +39 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/email_test.py +34 -10
- kreuzberg-3.15.0/tests/extractors/html_invalid_base64_test.py +11 -0
- kreuzberg-3.15.0/tests/extractors/image_deduplication_test.py +87 -0
- kreuzberg-3.15.0/tests/extractors/image_error_handling_test.py +251 -0
- kreuzberg-3.15.0/tests/extractors/image_error_simple_test.py +75 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/image_test.py +161 -75
- kreuzberg-3.15.0/tests/extractors/pdf_images_test.py +52 -0
- kreuzberg-3.15.0/tests/extractors/pdf_sync_images_test.py +217 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/pdf_test.py +26 -11
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/presentation_test.py +33 -0
- kreuzberg-3.14.0/tests/extractors/spreed_sheet_test.py → kreuzberg-3.15.0/tests/extractors/spreadsheet_test.py +39 -28
- kreuzberg-3.15.0/tests/features/gmft_test.py +528 -0
- kreuzberg-3.15.0/tests/features/language_detection_test.py +415 -0
- kreuzberg-3.15.0/tests/integration/all_extractors_images_test.py +231 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/dpi_integration_test.py +9 -44
- kreuzberg-3.15.0/tests/integration/pandoc_images_test.py +30 -0
- kreuzberg-3.15.0/tests/integration/pdf_images_test.py +18 -0
- kreuzberg-3.15.0/tests/integration/pdf_real_images_test.py +52 -0
- kreuzberg-3.15.0/tests/integration/pptx_complex_test.py +22 -0
- kreuzberg-3.15.0/tests/integration/pptx_images_test.py +18 -0
- kreuzberg-3.15.0/tests/interfaces/mcp_server_test.py +1275 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/paddleocr_test.py +30 -20
- kreuzberg-3.15.0/tests/performance/large_pdf_perf_test.py +29 -0
- kreuzberg-3.15.0/tests/utils/playa_test.py +264 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/uv.lock +57 -25
- kreuzberg-3.14.0/benchmarks/src/__main__.py +0 -4
- kreuzberg-3.14.0/benchmarks/src/benchmarks.py +0 -302
- kreuzberg-3.14.0/docker-compose.example.yml +0 -26
- kreuzberg-3.14.0/kreuzberg/_extractors/_base.py +0 -62
- kreuzberg-3.14.0/kreuzberg/_extractors/_html.py +0 -43
- kreuzberg-3.14.0/kreuzberg/_mcp/server.py +0 -194
- kreuzberg-3.14.0/kreuzberg/_utils/_tmp.py +0 -28
- kreuzberg-3.14.0/results/baseline.json +0 -9
- kreuzberg-3.14.0/results/serialization.json +0 -11
- kreuzberg-3.14.0/results/statistical.json +0 -21
- kreuzberg-3.14.0/test_report.json +0 -16
- kreuzberg-3.14.0/tests/core/extraction_batch_test.py +0 -0
- kreuzberg-3.14.0/tests/core/extraction_test.py +0 -0
- kreuzberg-3.14.0/tests/core/types_test.py +0 -0
- kreuzberg-3.14.0/tests/features/gmft_test.py +0 -0
- kreuzberg-3.14.0/tests/features/language_detection_test.py +0 -0
- kreuzberg-3.14.0/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.commitlintrc +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.deepsource.toml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.docker/README.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.dockerignore +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/LICENSE +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.14.0/kreuzberg → kreuzberg-3.15.0/benchmarks}/py.typed +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docker-logs/docker-info.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docker-logs/docker-version.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/cli.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/contributing.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/kreuzberg/exceptions.py +0 -0
- /kreuzberg-3.14.0/output.txt → /kreuzberg-3.15.0/kreuzberg/py.typed +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/conftest.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/exceptions_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/mime_types_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/core/registry_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/e2e/docker_e2e_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/features/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/features/chunker_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/features/document_classification_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/features/entity_extraction_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.14.0/tests/utils → kreuzberg-3.15.0/tests/performance}/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- /kreuzberg-3.14.0/tests/core/config_test.py → /kreuzberg-3.15.0/tests/utils/__init__.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.14.0 → kreuzberg-3.15.0}/tests/utils/tmp_test.py +0 -0
@@ -99,7 +99,7 @@ jobs:
|
|
99
99
|
run: |
|
100
100
|
mkdir -p tests/e2e/logs
|
101
101
|
echo "Running E2E tests for ${{ matrix.image.name }}..."
|
102
|
-
python3 tests/e2e/
|
102
|
+
python3 tests/e2e/docker_e2e.py --image ${{ matrix.image.name }}
|
103
103
|
|
104
104
|
- name: Generate test report - ${{ matrix.image.name }}
|
105
105
|
if: always()
|
@@ -1,18 +1,20 @@
|
|
1
1
|
*$py.class
|
2
2
|
*.Cache
|
3
|
-
.clause/
|
4
3
|
*.cscfg
|
5
4
|
*.egg-info/
|
6
5
|
*.log
|
7
6
|
*.py[cod]
|
8
7
|
*.suo
|
8
|
+
*.tar.gz
|
9
|
+
*.temp
|
10
|
+
*.tmp
|
9
11
|
*.user
|
12
|
+
*.whl
|
10
13
|
*temp/
|
14
|
+
.cache/
|
15
|
+
.claude/
|
11
16
|
.coverage
|
12
17
|
.coverage*
|
13
|
-
coverage.lcov
|
14
|
-
htmlcov/
|
15
|
-
.claude/
|
16
18
|
.cursorrules
|
17
19
|
.dist/
|
18
20
|
.DS_store
|
@@ -20,47 +22,47 @@ htmlcov/
|
|
20
22
|
.idea/
|
21
23
|
.kreuzberg/
|
22
24
|
.mypy_cache/
|
25
|
+
.nox/
|
23
26
|
.pytest_cache/
|
24
27
|
.python-version
|
25
28
|
.ropeproject
|
26
29
|
.ruff_cache/
|
27
30
|
.run/
|
31
|
+
.task/
|
32
|
+
.tmp/
|
33
|
+
.tox/
|
28
34
|
.venv/
|
29
35
|
.vscode/
|
30
36
|
.windsurfrules
|
31
37
|
__pycache__/
|
38
|
+
AGENTS.md
|
32
39
|
benchmark_results.json
|
40
|
+
benchmarks/results/
|
41
|
+
build/
|
33
42
|
CLAUDE.md
|
43
|
+
coverage.lcov
|
34
44
|
coverage.xml
|
45
|
+
dist/
|
35
46
|
docker-compose.yaml
|
47
|
+
docs/_build/
|
48
|
+
docs/build/
|
36
49
|
GEMINI.md
|
50
|
+
htmlcov/
|
51
|
+
node_modules/
|
52
|
+
npm-debug.log*
|
53
|
+
output.txt
|
37
54
|
prompt_template.egg-info/
|
38
55
|
requirements.txt
|
56
|
+
share/python-wheels/
|
39
57
|
site/
|
40
|
-
.
|
41
|
-
dist/
|
42
|
-
build/
|
43
|
-
.task/
|
44
|
-
tests/e2e/test_report.json
|
58
|
+
test_report.json
|
45
59
|
tests/e2e/logs/
|
46
|
-
|
47
|
-
# Additional build artifacts
|
48
|
-
*.whl
|
49
|
-
*.tar.gz
|
50
|
-
.tox/
|
51
|
-
.nox/
|
60
|
+
tests/e2e/test_report.json
|
52
61
|
wheels/
|
53
|
-
share/python-wheels/
|
54
|
-
|
55
|
-
# Documentation builds
|
56
|
-
docs/_build/
|
57
|
-
docs/build/
|
58
|
-
|
59
|
-
# Node.js (if any frontend tools are used)
|
60
|
-
node_modules/
|
61
|
-
npm-debug.log*
|
62
62
|
yarn-debug.log*
|
63
63
|
yarn-error.log*
|
64
|
+
todo.md
|
65
|
+
TODO.md
|
64
66
|
|
65
67
|
# Temporary files
|
66
68
|
*.tmp
|
@@ -69,3 +71,4 @@ yarn-error.log*
|
|
69
71
|
|
70
72
|
# AI Rules generated files
|
71
73
|
.claude/agents/
|
74
|
+
AGENTS.md
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.15.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -107,6 +107,7 @@ Description-Content-Type: text/markdown
|
|
107
107
|
### Document Intelligence Capabilities
|
108
108
|
|
109
109
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
110
|
+
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
110
111
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
111
112
|
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
112
113
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
@@ -16,6 +16,7 @@
|
|
16
16
|
### Document Intelligence Capabilities
|
17
17
|
|
18
18
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
19
|
+
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
19
20
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
20
21
|
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
21
22
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
@@ -16,7 +16,7 @@ tasks:
|
|
16
16
|
deps:
|
17
17
|
- docker:build
|
18
18
|
cmds:
|
19
|
-
- uv run python {{.TEST_DIR}}/
|
19
|
+
- uv run python {{.TEST_DIR}}/docker_e2e.py
|
20
20
|
|
21
21
|
docker:build:
|
22
22
|
desc: "Build all Docker images for testing"
|
@@ -67,7 +67,7 @@ tasks:
|
|
67
67
|
docker:test:
|
68
68
|
desc: "Run Docker E2E tests (images must be built)"
|
69
69
|
cmds:
|
70
|
-
- uv run python {{.TEST_DIR}}/
|
70
|
+
- uv run python {{.TEST_DIR}}/docker_e2e.py
|
71
71
|
|
72
72
|
docker:clean:
|
73
73
|
desc: "Clean up Docker test images and containers"
|
@@ -385,6 +385,7 @@ rules:
|
|
385
385
|
- NEVER proactively create documentation files (*.md) or README files
|
386
386
|
- Only create documentation files if explicitly requested by the User
|
387
387
|
- All builtin imports should be at the top level (except for cyclical or optional dependencies)
|
388
|
+
- All config dataclasses must be hashable, frozen, and use slots: `@dataclass(unsafe_hash=True, frozen=True, slots=True)`
|
388
389
|
- When committing, always use the format specified in the CLAUDE.md
|
389
390
|
name: Important Instructions
|
390
391
|
priority: critical
|
@@ -87,6 +87,18 @@ uv run python -m benchmarks.src run --stress
|
|
87
87
|
# Run backend comparison benchmarks
|
88
88
|
uv run python -m benchmarks.src run --backend-comparison
|
89
89
|
|
90
|
+
# Include Tesseract OCR benchmarks (sync)
|
91
|
+
uv run python -m benchmarks.src run --tesseract
|
92
|
+
|
93
|
+
# Include expanded Tesseract variant matrix (formats/PSM)
|
94
|
+
uv run python -m benchmarks.src run --tesseract --tesseract-matrix
|
95
|
+
|
96
|
+
# Compare Tesseract architectures (threads vs processes)
|
97
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch
|
98
|
+
|
99
|
+
# Compare with custom worker counts (e.g., 1,4,8)
|
100
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch --workers 1,4,8
|
101
|
+
|
90
102
|
# Custom test files directory
|
91
103
|
uv run python -m benchmarks.src run --test-files-dir /path/to/test/files
|
92
104
|
|
@@ -232,3 +244,21 @@ uv run python -m benchmarks.src run --sync-only --suite-name main_baseline
|
|
232
244
|
uv run python -m benchmarks.src run --sync-only --suite-name pr_test
|
233
245
|
uv run python -m benchmarks.src compare results/main_baseline.json results/pr_test.json
|
234
246
|
```
|
247
|
+
|
248
|
+
### Tesseract Benchmarks
|
249
|
+
|
250
|
+
The suite includes focused Tesseract OCR benchmarks:
|
251
|
+
|
252
|
+
- `--tesseract` adds thread-based batch OCR and a process-pool placeholder for A/B comparisons.
|
253
|
+
- `--tesseract-matrix` expands with a small matrix across output formats (`text`, `markdown`, `tsv`) and PSM modes
|
254
|
+
(`AUTO`, `SINGLE_BLOCK`, `SINGLE_LINE`) to quantify overhead of richer outputs and segmentation strategies.
|
255
|
+
|
256
|
+
Examples:
|
257
|
+
|
258
|
+
```bash
|
259
|
+
# Minimal Tesseract batch OCR benchmarks
|
260
|
+
uv run python -m benchmarks.src run --sync-only --tesseract
|
261
|
+
|
262
|
+
# Full Tesseract config matrix
|
263
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-matrix
|
264
|
+
```
|
@@ -0,0 +1,179 @@
|
|
1
|
+
import json
|
2
|
+
import shutil
|
3
|
+
import tempfile
|
4
|
+
import time
|
5
|
+
from concurrent.futures import ProcessPoolExecutor
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Any
|
8
|
+
|
9
|
+
from PIL import Image, ImageDraw
|
10
|
+
|
11
|
+
from kreuzberg import extract_file_sync
|
12
|
+
from kreuzberg._ocr._tesseract import _process_image_with_tesseract
|
13
|
+
from kreuzberg._types import ExtractionConfig
|
14
|
+
from kreuzberg._utils._process_pool import get_optimal_worker_count, process_pool
|
15
|
+
|
16
|
+
|
17
|
+
def create_test_images(sizes: list[tuple[int, int]], output_dir: Path) -> list[Path]:
|
18
|
+
output_dir.mkdir(exist_ok=True)
|
19
|
+
image_paths = []
|
20
|
+
|
21
|
+
for i, (width, height) in enumerate(sizes):
|
22
|
+
img = Image.new("RGB", (width, height), color="white")
|
23
|
+
draw = ImageDraw.Draw(img)
|
24
|
+
|
25
|
+
for y in range(0, height, 50):
|
26
|
+
for x in range(0, width, 100):
|
27
|
+
draw.text((x, y), f"Test {i}", fill="black")
|
28
|
+
|
29
|
+
path = output_dir / f"test_{width}x{height}_{i}.png"
|
30
|
+
img.save(path)
|
31
|
+
image_paths.append(path)
|
32
|
+
|
33
|
+
return image_paths
|
34
|
+
|
35
|
+
|
36
|
+
def benchmark_batch_fixed_workers(images: list[Path], num_workers: int) -> dict[str, Any]:
|
37
|
+
start = time.perf_counter()
|
38
|
+
config_dict = {"language": "eng", "psm": 3}
|
39
|
+
|
40
|
+
with ProcessPoolExecutor(max_workers=num_workers) as pool:
|
41
|
+
futures = [pool.submit(_process_image_with_tesseract, str(p), config_dict) for p in images]
|
42
|
+
[f.result() for f in futures]
|
43
|
+
|
44
|
+
duration = time.perf_counter() - start
|
45
|
+
return {
|
46
|
+
"strategy": "fixed",
|
47
|
+
"workers": num_workers,
|
48
|
+
"batch_size": len(images),
|
49
|
+
"duration": duration,
|
50
|
+
"per_image": duration / len(images) if images else 0,
|
51
|
+
}
|
52
|
+
|
53
|
+
|
54
|
+
def benchmark_batch_dynamic_workers(images: list[Path]) -> dict[str, Any]:
|
55
|
+
start = time.perf_counter()
|
56
|
+
config_dict = {"language": "eng", "psm": 3}
|
57
|
+
|
58
|
+
optimal_workers = get_optimal_worker_count(len(images), cpu_intensive=True)
|
59
|
+
|
60
|
+
with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
|
61
|
+
futures = [pool.submit(_process_image_with_tesseract, str(p), config_dict) for p in images]
|
62
|
+
[f.result() for f in futures]
|
63
|
+
|
64
|
+
duration = time.perf_counter() - start
|
65
|
+
return {
|
66
|
+
"strategy": "dynamic",
|
67
|
+
"workers": optimal_workers,
|
68
|
+
"batch_size": len(images),
|
69
|
+
"duration": duration,
|
70
|
+
"per_image": duration / len(images) if images else 0,
|
71
|
+
}
|
72
|
+
|
73
|
+
|
74
|
+
def benchmark_batch_shared_pool(images: list[Path]) -> dict[str, Any]:
    """Time OCR of a batch on the pool yielded by ``process_pool()``.

    Args:
        images: Paths of the images to process.

    Returns:
        A dict with the keys ``strategy`` (always ``"shared_pool"``),
        ``workers``, ``batch_size``, ``duration`` (seconds) and ``per_image``
        (seconds per image, ``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    ocr_settings = {"language": "eng", "psm": 3}

    with process_pool() as shared:
        pending = [
            shared.submit(_process_image_with_tesseract, str(image), ocr_settings)
            for image in images
        ]
        # Wait for completion in submission order; worker errors surface here.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at

    image_count = len(images)
    if images:
        seconds_per_image = elapsed / image_count
    else:
        seconds_per_image = 0

    return {
        "strategy": "shared_pool",
        # NOTE(review): the reported worker count is a hard-coded 14, not read
        # from the pool itself - confirm it matches process_pool()'s real size.
        "workers": 14,
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": seconds_per_image,
    }
|
90
|
+
|
91
|
+
|
92
|
+
def benchmark_extraction_api(images: list[Path]) -> dict[str, Any]:
    """Time sequential extraction of a batch through ``extract_file_sync``.

    Each image is extracted one after another with a single
    ``ExtractionConfig(use_cache=False, force_ocr=True)``; the extraction
    results themselves are discarded, only the elapsed time is kept.

    Args:
        images: Paths of the images to extract.

    Returns:
        A dict with the keys ``strategy`` (always ``"extraction_api"``),
        ``workers`` (the literal string ``"auto"``), ``batch_size``,
        ``duration`` (seconds) and ``per_image`` (seconds per image, ``0``
        for an empty batch).
    """
    started_at = time.perf_counter()

    extraction_config = ExtractionConfig(use_cache=False, force_ocr=True)

    for source in images:
        extract_file_sync(source, config=extraction_config)

    elapsed = time.perf_counter() - started_at

    image_count = len(images)
    if images:
        seconds_per_image = elapsed / image_count
    else:
        seconds_per_image = 0

    return {
        "strategy": "extraction_api",
        "workers": "auto",
        "batch_size": image_count,
        "duration": elapsed,
        "per_image": seconds_per_image,
    }
|
108
|
+
|
109
|
+
|
110
|
+
def main() -> None:
    """Run every batch strategy over several image and batch sizes and save the timings.

    For each image size, ``max(batch_sizes)`` test images are generated in a
    temporary directory, and each batch size is benchmarked on a prefix of
    that list with the fixed, dynamic and shared-pool strategies (plus the
    extraction API for batches of at most 10 images). Every non-baseline
    strategy gets an ``improvement_pct`` relative to the fixed-worker run.
    The collected results are written to
    ``results/batch_size_benchmarks.json`` relative to the current working
    directory.

    The temporary image directory is removed even when a benchmark or the
    JSON write raises, so failed runs do not leave generated images behind.
    """
    batch_sizes = [1, 2, 5, 10, 20]
    image_sizes = [
        (640, 480),
        (1024, 768),
        (1920, 1080),
    ]
    max_batch = max(batch_sizes)  # loop-invariant: largest batch to generate images for

    test_dir = Path(tempfile.mkdtemp(prefix="kreuzberg_bench_"))
    results: list[dict[str, Any]] = []

    try:
        for img_width, img_height in image_sizes:
            images = create_test_images([(img_width, img_height)] * max_batch, test_dir)

            for batch_size in batch_sizes:
                batch = images[:batch_size]

                # The fixed-worker run is the baseline and must stay first in the list.
                fixed_result = benchmark_batch_fixed_workers(batch, 14)
                strategies = [
                    fixed_result,
                    benchmark_batch_dynamic_workers(batch),
                    benchmark_batch_shared_pool(batch),
                ]

                # NOTE(review): the extraction API is only measured for batches
                # of up to 10 images - presumably because it runs sequentially
                # and would dominate the run time; confirm.
                if batch_size <= 10:
                    strategies.append(benchmark_extraction_api(batch))

                baseline = fixed_result["duration"]
                if baseline > 0:
                    for strategy in strategies[1:]:
                        strategy["improvement_pct"] = ((baseline - strategy["duration"]) / baseline) * 100

                results.append(
                    {
                        "image_size": f"{img_width}x{img_height}",
                        "batch_size": batch_size,
                        "strategies": strategies,
                    }
                )

        output_file = Path("results/batch_size_benchmarks.json")
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with output_file.open("w", encoding="utf-8") as f:
            json.dump({"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "results": results}, f, indent=2)
    finally:
        # Always clean up the generated images; ignore_errors keeps a cleanup
        # failure from masking an exception raised by the benchmarks above.
        shutil.rmtree(test_dir, ignore_errors=True)
|
176
|
+
|
177
|
+
|
178
|
+
if __name__ == "__main__":
|
179
|
+
main()
|
@@ -0,0 +1,83 @@
|
|
1
|
+
import json
|
2
|
+
import time
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from kreuzberg import extract_file_sync
|
7
|
+
from kreuzberg._types import ExtractionConfig
|
8
|
+
|
9
|
+
|
10
|
+
def benchmark_real_world_scenario(file_paths: list[Path], scenario_name: str) -> dict[str, Any]:
    """Time sequential extraction of ``file_paths`` with caching disabled.

    Each file is extracted with ``extract_file_sync`` using
    ``ExtractionConfig(use_cache=False)``; only the length of each result's
    ``content`` is retained.

    Args:
        file_paths: Files to extract, processed in order.
        scenario_name: Label stored in the returned record.

    Returns:
        A dict with the keys ``scenario``, ``file_count``, ``duration``
        (seconds), ``per_file`` (seconds per file, ``0`` when ``file_paths``
        is empty) and ``total_chars`` (sum of extracted content lengths).
    """
    config = ExtractionConfig(use_cache=False)

    start = time.perf_counter()
    char_counts = [len(extract_file_sync(path, config=config).content) for path in file_paths]
    duration = time.perf_counter() - start

    return {
        "scenario": scenario_name,
        "file_count": len(file_paths),
        "duration": duration,
        # Guard the division so an empty scenario yields 0 instead of raising.
        "per_file": duration / len(file_paths) if file_paths else 0,
        "total_chars": sum(char_counts),
    }
|
28
|
+
|
29
|
+
|
30
|
+
def main() -> None:
    """Benchmark extraction over several file-type scenarios and save a JSON report.

    Test files are looked up in ``tests/test_source_files`` of the repository
    that contains this script (resolved from ``__file__`` rather than a
    machine-specific absolute path). A scenario with no matching files is
    skipped. The per-scenario records and an overall summary are written to
    ``results/final_batch_validation.json`` relative to the current working
    directory; the ``results`` directory is created when missing.
    """
    # (scenario name, glob patterns, maximum number of files taken per pattern)
    scenario_specs = [
        ("Mixed Office Documents", ("*.pdf", "*.docx", "*.xlsx", "*.pptx"), 2),
        ("Image Batch Processing", ("*.png", "*.jpg", "*.jpeg"), 3),
        ("PDF Document Processing", ("*.pdf",), 5),
        ("Small Text Files", ("*.txt", "*.md", "*.html"), 3),
    ]

    # This script lives in <repo>/benchmarks/, so the repository root is two levels up.
    test_dir = Path(__file__).resolve().parent.parent / "tests" / "test_source_files"

    scenarios = []
    for scenario_name, patterns, per_pattern_limit in scenario_specs:
        files: list[Path] = []
        for pattern in patterns:
            files.extend(list(test_dir.glob(pattern))[:per_pattern_limit])
        if files:
            scenarios.append(benchmark_real_world_scenario(files, scenario_name))

    total_files = sum(s["file_count"] for s in scenarios)
    total_time = sum(s["duration"] for s in scenarios)
    total_chars = sum(s["total_chars"] for s in scenarios)

    output = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "scenarios": scenarios,
        "summary": {
            "total_files": total_files,
            "total_time": total_time,
            "avg_per_file": total_time / total_files if total_files > 0 else 0,
            "total_chars": total_chars,
            "throughput": total_chars / total_time if total_time > 0 else 0,
        },
    }

    output_file = Path("results/final_batch_validation.json")
    # Without this, a fresh checkout fails with FileNotFoundError on open().
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
|
80
|
+
|
81
|
+
|
82
|
+
if __name__ == "__main__":
|
83
|
+
main()
|
@@ -17,18 +17,13 @@ classifiers = [
|
|
17
17
|
# kreuzberg-bench = "src.cli:app"
|
18
18
|
|
19
19
|
dependencies = [
|
20
|
+
"click>=8.2.1",
|
20
21
|
"kreuzberg",
|
21
|
-
"
|
22
|
-
"memory-profiler>=0.61",
|
23
|
-
"pandas>=2",
|
22
|
+
"msgpack>=1.1.1",
|
24
23
|
"psutil>=5.9",
|
25
24
|
"py-spy>=0.3.14",
|
26
25
|
"rich>=13",
|
27
|
-
"typer>=0.9",
|
28
26
|
]
|
29
27
|
|
30
|
-
[tool.ruff]
|
31
|
-
lint.extend-ignore = [ "ARG002", "B008", "B904", "BLE001", "E722", "PLR2004", "PYI036", "SLF001" ]
|
32
|
-
|
33
28
|
[tool.uv.sources]
|
34
29
|
kreuzberg = { workspace = true }
|