kreuzberg 3.14.1__tar.gz → 3.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/ci.yaml +1 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/docker-e2e-tests.yml +1 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.gitignore +27 -24
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/PKG-INFO +13 -11
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/README.md +11 -9
- kreuzberg-3.16.0/Taskfile.yml +50 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/ai-rulez.yaml +1 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/README.md +30 -0
- kreuzberg-3.16.0/benchmarks/batch_size_benchmark.py +179 -0
- kreuzberg-3.16.0/benchmarks/batch_validation_benchmark.py +83 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/pyproject.toml +2 -7
- kreuzberg-3.16.0/benchmarks/src/__main__.py +4 -0
- kreuzberg-3.16.0/benchmarks/src/benchmarks.py +703 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/src/cli.py +215 -182
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/src/models.py +10 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/src/profiler.py +12 -21
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/src/runner.py +52 -63
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/types.md +32 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/examples/extraction-examples.md +348 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/index.md +2 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/api-server.md +128 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/extraction-configuration.md +265 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/metadata-extraction.md +51 -0
- kreuzberg-3.16.0/docs/user-guide/supported-formats.md +71 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/__init__.py +10 -0
- kreuzberg-3.16.0/kreuzberg/_api/_config_cache.py +247 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_api/main.py +74 -45
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_chunker.py +7 -6
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_config.py +11 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_constants.py +2 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_document_classification.py +5 -7
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_entity_extraction.py +9 -4
- kreuzberg-3.16.0/kreuzberg/_extractors/_base.py +328 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_email.py +101 -27
- kreuzberg-3.16.0/kreuzberg/_extractors/_html.py +148 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_image.py +23 -22
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pandoc.py +106 -75
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pdf.py +208 -99
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_presentation.py +76 -8
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_spread_sheet.py +24 -30
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/_structured.py +83 -15
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_gmft.py +5 -0
- kreuzberg-3.16.0/kreuzberg/_mcp/server.py +493 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_mime_types.py +42 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/_easyocr.py +53 -21
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/_paddleocr.py +1 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/_tesseract.py +88 -37
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_types.py +291 -61
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_cache.py +10 -4
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_device.py +2 -4
- kreuzberg-3.16.0/kreuzberg/_utils/_html_streaming.py +20 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_image_preprocessing.py +12 -39
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_process_pool.py +29 -8
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_quality.py +7 -2
- kreuzberg-3.16.0/kreuzberg/_utils/_resource_managers.py +65 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_serialization.py +13 -6
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_sync.py +39 -10
- kreuzberg-3.16.0/kreuzberg/_utils/_tmp.py +64 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/cli.py +34 -20
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/pyproject.toml +7 -23
- kreuzberg-3.16.0/tests/api/config_cache_test.py +248 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/api/conftest.py +1 -0
- kreuzberg-3.16.0/tests/api/header_config_hashing_test.py +29 -0
- kreuzberg-3.16.0/tests/api/image_extraction_test.py +59 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/api/main_test.py +7 -8
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/api/runtime_config_test.py +52 -0
- kreuzberg-3.16.0/tests/core/comprehensive_config_test.py +603 -0
- kreuzberg-3.16.0/tests/core/config_test.py +15 -0
- kreuzberg-3.16.0/tests/core/constants_test.py +22 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/core/dpi_configuration_test.py +44 -78
- kreuzberg-3.16.0/tests/core/exceptions_test.py +159 -0
- kreuzberg-3.16.0/tests/core/extraction_batch_test.py +389 -0
- kreuzberg-3.16.0/tests/core/extraction_test.py +494 -0
- kreuzberg-3.16.0/tests/core/image_ocr_result_test.py +27 -0
- kreuzberg-3.16.0/tests/core/init_test.py +85 -0
- kreuzberg-3.16.0/tests/core/main_test.py +35 -0
- kreuzberg-3.16.0/tests/core/mime_types_test.py +242 -0
- kreuzberg-3.16.0/tests/core/registry_test.py +225 -0
- kreuzberg-3.16.0/tests/core/types_test.py +403 -0
- kreuzberg-3.16.0/tests/extractors/README_image_tests.md +85 -0
- kreuzberg-3.16.0/tests/extractors/base_extractor_test.py +420 -0
- kreuzberg-3.16.0/tests/extractors/base_memory_limits_test.py +100 -0
- kreuzberg-3.16.0/tests/extractors/base_ocr_processing_test.py +276 -0
- kreuzberg-3.16.0/tests/extractors/base_ocr_simple_test.py +64 -0
- kreuzberg-3.16.0/tests/extractors/email_error_paths_test.py +39 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/email_test.py +33 -9
- kreuzberg-3.16.0/tests/extractors/html_invalid_base64_test.py +11 -0
- kreuzberg-3.16.0/tests/extractors/image_deduplication_test.py +87 -0
- kreuzberg-3.16.0/tests/extractors/image_error_handling_test.py +253 -0
- kreuzberg-3.16.0/tests/extractors/image_error_simple_test.py +75 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/image_test.py +144 -75
- kreuzberg-3.16.0/tests/extractors/json_test.py +427 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/pandoc_test.py +27 -29
- kreuzberg-3.16.0/tests/extractors/pdf_images_test.py +52 -0
- kreuzberg-3.16.0/tests/extractors/pdf_sync_images_test.py +217 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/pdf_test.py +34 -14
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/presentation_test.py +33 -0
- kreuzberg-3.14.1/tests/extractors/spreed_sheet_test.py → kreuzberg-3.16.0/tests/extractors/spreadsheet_test.py +52 -37
- kreuzberg-3.16.0/tests/features/chunker_test.py +94 -0
- kreuzberg-3.16.0/tests/features/document_classification_test.py +747 -0
- kreuzberg-3.16.0/tests/features/entity_extraction_test.py +348 -0
- kreuzberg-3.16.0/tests/features/gmft_test.py +1496 -0
- kreuzberg-3.16.0/tests/features/language_detection_test.py +387 -0
- kreuzberg-3.16.0/tests/integration/all_extractors_images_test.py +252 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/dpi_integration_test.py +9 -44
- kreuzberg-3.16.0/tests/integration/pandoc_images_test.py +30 -0
- kreuzberg-3.16.0/tests/integration/pdf_images_test.py +18 -0
- kreuzberg-3.16.0/tests/integration/pdf_real_images_test.py +52 -0
- kreuzberg-3.16.0/tests/integration/pptx_complex_test.py +22 -0
- kreuzberg-3.16.0/tests/integration/pptx_images_test.py +18 -0
- kreuzberg-3.16.0/tests/interfaces/cli_test.py +527 -0
- kreuzberg-3.16.0/tests/interfaces/mcp_server_test.py +1116 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/multiprocessing/gmft_isolated_test.py +1 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/easyocr_test.py +6 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/paddleocr_test.py +31 -20
- kreuzberg-3.16.0/tests/performance/large_pdf_perf_test.py +29 -0
- kreuzberg-3.16.0/tests/test_source_files/json/complex_nested.json +41 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/aws_policy.json +43 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/earthquakes.geojson +6 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/github_emojis.json +111 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/iss_location.json +1 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/openapi_spec.json +84 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/package.json +33 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/rick_morty_character.json +1 -0
- kreuzberg-3.16.0/tests/test_source_files/json/schema_test.json +25 -0
- kreuzberg-3.16.0/tests/utils/playa_metadata_test.py +753 -0
- kreuzberg-3.16.0/tests/utils/playa_test.py +315 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/process_pool_test.py +1 -1
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/serialization_test.py +82 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/uv.lock +76 -44
- kreuzberg-3.14.1/Taskfile.yml +0 -161
- kreuzberg-3.14.1/benchmarks/src/__main__.py +0 -4
- kreuzberg-3.14.1/benchmarks/src/benchmarks.py +0 -302
- kreuzberg-3.14.1/docker-compose.example.yml +0 -26
- kreuzberg-3.14.1/docs/user-guide/supported-formats.md +0 -48
- kreuzberg-3.14.1/kreuzberg/_extractors/_base.py +0 -62
- kreuzberg-3.14.1/kreuzberg/_extractors/_html.py +0 -43
- kreuzberg-3.14.1/kreuzberg/_mcp/server.py +0 -194
- kreuzberg-3.14.1/kreuzberg/_utils/_tmp.py +0 -28
- kreuzberg-3.14.1/results/baseline.json +0 -9
- kreuzberg-3.14.1/results/serialization.json +0 -11
- kreuzberg-3.14.1/results/statistical.json +0 -21
- kreuzberg-3.14.1/test_report.json +0 -16
- kreuzberg-3.14.1/tests/core/exceptions_test.py +0 -0
- kreuzberg-3.14.1/tests/core/extraction_batch_test.py +0 -0
- kreuzberg-3.14.1/tests/core/extraction_test.py +0 -0
- kreuzberg-3.14.1/tests/core/mime_types_test.py +0 -0
- kreuzberg-3.14.1/tests/core/registry_test.py +0 -0
- kreuzberg-3.14.1/tests/core/types_test.py +0 -0
- kreuzberg-3.14.1/tests/features/chunker_test.py +0 -0
- kreuzberg-3.14.1/tests/features/document_classification_test.py +0 -0
- kreuzberg-3.14.1/tests/features/entity_extraction_test.py +0 -0
- kreuzberg-3.14.1/tests/features/gmft_test.py +0 -0
- kreuzberg-3.14.1/tests/features/language_detection_test.py +0 -0
- kreuzberg-3.14.1/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.commitlintrc +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.deepsource.toml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.docker/README.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.dockerignore +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/LICENSE +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.14.1/kreuzberg → kreuzberg-3.16.0/benchmarks}/py.typed +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docker-logs/docker-info.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docker-logs/docker-version.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/cli.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/contributing.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/kreuzberg/exceptions.py +0 -0
- /kreuzberg-3.14.1/output.txt → /kreuzberg-3.16.0/kreuzberg/py.typed +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/conftest.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/core/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/e2e/__init__.py +0 -0
- /kreuzberg-3.14.1/tests/e2e/docker_e2e_test.py → /kreuzberg-3.16.0/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/features/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.14.1/tests/utils → kreuzberg-3.16.0/tests/performance}/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- /kreuzberg-3.14.1/tests/core/config_test.py → /kreuzberg-3.16.0/tests/utils/__init__.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.14.1 → kreuzberg-3.16.0}/tests/utils/tmp_test.py +0 -0
@@ -99,7 +99,7 @@ jobs:
|
|
99
99
|
run: |
|
100
100
|
mkdir -p tests/e2e/logs
|
101
101
|
echo "Running E2E tests for ${{ matrix.image.name }}..."
|
102
|
-
python3 tests/e2e/docker_e2e_test.py --image ${{ matrix.image.name }}
|
102
|
+
python3 tests/e2e/docker_e2e.py --image ${{ matrix.image.name }}
|
103
103
|
|
104
104
|
- name: Generate test report - ${{ matrix.image.name }}
|
105
105
|
if: always()
|
@@ -1,18 +1,20 @@
|
|
1
1
|
*$py.class
|
2
2
|
*.Cache
|
3
|
-
.clause/
|
4
3
|
*.cscfg
|
5
4
|
*.egg-info/
|
6
5
|
*.log
|
7
6
|
*.py[cod]
|
8
7
|
*.suo
|
8
|
+
*.tar.gz
|
9
|
+
*.temp
|
10
|
+
*.tmp
|
9
11
|
*.user
|
12
|
+
*.whl
|
10
13
|
*temp/
|
14
|
+
.cache/
|
15
|
+
.claude/
|
11
16
|
.coverage
|
12
17
|
.coverage*
|
13
|
-
coverage.lcov
|
14
|
-
htmlcov/
|
15
|
-
.claude/
|
16
18
|
.cursorrules
|
17
19
|
.dist/
|
18
20
|
.DS_store
|
@@ -20,47 +22,47 @@ htmlcov/
|
|
20
22
|
.idea/
|
21
23
|
.kreuzberg/
|
22
24
|
.mypy_cache/
|
25
|
+
.nox/
|
23
26
|
.pytest_cache/
|
24
27
|
.python-version
|
25
28
|
.ropeproject
|
26
29
|
.ruff_cache/
|
27
30
|
.run/
|
31
|
+
.task/
|
32
|
+
.tmp/
|
33
|
+
.tox/
|
28
34
|
.venv/
|
29
35
|
.vscode/
|
30
36
|
.windsurfrules
|
31
37
|
__pycache__/
|
38
|
+
AGENTS.md
|
32
39
|
benchmark_results.json
|
40
|
+
benchmarks/results/
|
41
|
+
build/
|
33
42
|
CLAUDE.md
|
43
|
+
coverage.lcov
|
34
44
|
coverage.xml
|
45
|
+
dist/
|
35
46
|
docker-compose.yaml
|
47
|
+
docs/_build/
|
48
|
+
docs/build/
|
36
49
|
GEMINI.md
|
50
|
+
htmlcov/
|
51
|
+
node_modules/
|
52
|
+
npm-debug.log*
|
53
|
+
output.txt
|
37
54
|
prompt_template.egg-info/
|
38
55
|
requirements.txt
|
56
|
+
share/python-wheels/
|
39
57
|
site/
|
40
|
-
.
|
41
|
-
dist/
|
42
|
-
build/
|
43
|
-
.task/
|
44
|
-
tests/e2e/test_report.json
|
58
|
+
test_report.json
|
45
59
|
tests/e2e/logs/
|
46
|
-
|
47
|
-
# Additional build artifacts
|
48
|
-
*.whl
|
49
|
-
*.tar.gz
|
50
|
-
.tox/
|
51
|
-
.nox/
|
60
|
+
tests/e2e/test_report.json
|
52
61
|
wheels/
|
53
|
-
share/python-wheels/
|
54
|
-
|
55
|
-
# Documentation builds
|
56
|
-
docs/_build/
|
57
|
-
docs/build/
|
58
|
-
|
59
|
-
# Node.js (if any frontend tools are used)
|
60
|
-
node_modules/
|
61
|
-
npm-debug.log*
|
62
62
|
yarn-debug.log*
|
63
63
|
yarn-error.log*
|
64
|
+
todo.md
|
65
|
+
TODO.md
|
64
66
|
|
65
67
|
# Temporary files
|
66
68
|
*.tmp
|
@@ -69,3 +71,4 @@ yarn-error.log*
|
|
69
71
|
|
70
72
|
# AI Rules generated files
|
71
73
|
.claude/agents/
|
74
|
+
AGENTS.md
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.14.1
|
3
|
+
Version: 3.16.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
35
|
Requires-Dist: mcp>=1.14.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: numpy>=2.0.0
|
@@ -107,8 +107,9 @@ Description-Content-Type: text/markdown
|
|
107
107
|
### Document Intelligence Capabilities
|
108
108
|
|
109
109
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
110
|
+
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
110
111
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
111
|
-
- **Format Support**:
|
112
|
+
- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
112
113
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
113
114
|
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
114
115
|
|
@@ -226,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
|
|
226
227
|
|
227
228
|
## Supported Formats
|
228
229
|
|
229
|
-
| Category
|
230
|
-
|
|
231
|
-
| **Documents**
|
232
|
-
| **Images**
|
233
|
-
| **Spreadsheets**
|
234
|
-
| **Presentations**
|
235
|
-
| **Web**
|
236
|
-
| **
|
230
|
+
| Category | Formats |
|
231
|
+
| ------------------- | ------------------------------ |
|
232
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
233
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
234
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
235
|
+
| **Presentations** | PPTX, PPT, ODP |
|
236
|
+
| **Web** | HTML, XML, MHTML |
|
237
|
+
| **Structured Data** | JSON, YAML, TOML |
|
238
|
+
| **Archives** | Support via extraction |
|
237
239
|
|
238
240
|
## 📊 Performance Characteristics
|
239
241
|
|
@@ -16,8 +16,9 @@
|
|
16
16
|
### Document Intelligence Capabilities
|
17
17
|
|
18
18
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
19
|
+
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
19
20
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
20
|
-
- **Format Support**:
|
21
|
+
- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
21
22
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
22
23
|
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
23
24
|
|
@@ -135,14 +136,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
|
|
135
136
|
|
136
137
|
## Supported Formats
|
137
138
|
|
138
|
-
| Category
|
139
|
-
|
|
140
|
-
| **Documents**
|
141
|
-
| **Images**
|
142
|
-
| **Spreadsheets**
|
143
|
-
| **Presentations**
|
144
|
-
| **Web**
|
145
|
-
| **
|
139
|
+
| Category | Formats |
|
140
|
+
| ------------------- | ------------------------------ |
|
141
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
142
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
143
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
144
|
+
| **Presentations** | PPTX, PPT, ODP |
|
145
|
+
| **Web** | HTML, XML, MHTML |
|
146
|
+
| **Structured Data** | JSON, YAML, TOML |
|
147
|
+
| **Archives** | Support via extraction |
|
146
148
|
|
147
149
|
## 📊 Performance Characteristics
|
148
150
|
|
@@ -0,0 +1,50 @@
|
|
1
|
+
version: "3"
|
2
|
+
|
3
|
+
env:
|
4
|
+
DOCKER_BUILDKIT: 1
|
5
|
+
BUILDKIT_PROGRESS: plain
|
6
|
+
|
7
|
+
tasks:
|
8
|
+
setup:
|
9
|
+
desc: "Install dependencies with uv"
|
10
|
+
cmds:
|
11
|
+
- uv sync --all-extras --all-packages
|
12
|
+
- pre-commit install && pre-commit install --hook-type commit-msg
|
13
|
+
|
14
|
+
update:
|
15
|
+
desc: "Update the dependencies"
|
16
|
+
cmds:
|
17
|
+
- uv run uv-bump
|
18
|
+
- cd benchmarks && uv run uv-bump && cd -
|
19
|
+
- uv sync --all-extras --all-packages --upgrade
|
20
|
+
- pre-commit autoupdate
|
21
|
+
|
22
|
+
test:
|
23
|
+
desc: "Run tests with pytest"
|
24
|
+
cmds:
|
25
|
+
- uv run pytest
|
26
|
+
|
27
|
+
test:cov:
|
28
|
+
desc: "Run tests with coverage"
|
29
|
+
cmds:
|
30
|
+
- uv run pytest --cov
|
31
|
+
|
32
|
+
lint:
|
33
|
+
desc: "Lint code with ruff and docs with markdownlint"
|
34
|
+
cmds:
|
35
|
+
- pre-commit run --all-files
|
36
|
+
|
37
|
+
docs:build:
|
38
|
+
desc: "Build documentation"
|
39
|
+
cmds:
|
40
|
+
- uv run mkdocs build --clean --strict
|
41
|
+
|
42
|
+
docs:serve:
|
43
|
+
desc: "Serve documentation locally"
|
44
|
+
cmds:
|
45
|
+
- uv run mkdocs serve
|
46
|
+
|
47
|
+
default:
|
48
|
+
desc: "Show available tasks"
|
49
|
+
cmds:
|
50
|
+
- task --list
|
@@ -385,6 +385,7 @@ rules:
|
|
385
385
|
- NEVER proactively create documentation files (*.md) or README files
|
386
386
|
- Only create documentation files if explicitly requested by the User
|
387
387
|
- All builtin imports should be at the top level (except for cyclical or optional dependencies)
|
388
|
+
- All config dataclasses must be hashable, frozen, and use slots: `@dataclass(unsafe_hash=True, frozen=True, slots=True)`
|
388
389
|
- When committing, always use the format specified in the CLAUDE.md
|
389
390
|
name: Important Instructions
|
390
391
|
priority: critical
|
@@ -87,6 +87,18 @@ uv run python -m benchmarks.src run --stress
|
|
87
87
|
# Run backend comparison benchmarks
|
88
88
|
uv run python -m benchmarks.src run --backend-comparison
|
89
89
|
|
90
|
+
# Include Tesseract OCR benchmarks (sync)
|
91
|
+
uv run python -m benchmarks.src run --tesseract
|
92
|
+
|
93
|
+
# Include expanded Tesseract variant matrix (formats/PSM)
|
94
|
+
uv run python -m benchmarks.src run --tesseract --tesseract-matrix
|
95
|
+
|
96
|
+
# Compare Tesseract architectures (threads vs processes)
|
97
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch
|
98
|
+
|
99
|
+
# Compare with custom worker counts (e.g., 1,4,8)
|
100
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch --workers 1,4,8
|
101
|
+
|
90
102
|
# Custom test files directory
|
91
103
|
uv run python -m benchmarks.src run --test-files-dir /path/to/test/files
|
92
104
|
|
@@ -232,3 +244,21 @@ uv run python -m benchmarks.src run --sync-only --suite-name main_baseline
|
|
232
244
|
uv run python -m benchmarks.src run --sync-only --suite-name pr_test
|
233
245
|
uv run python -m benchmarks.src compare results/main_baseline.json results/pr_test.json
|
234
246
|
```
|
247
|
+
|
248
|
+
### Tesseract Benchmarks
|
249
|
+
|
250
|
+
The suite includes focused Tesseract OCR benchmarks:
|
251
|
+
|
252
|
+
- `--tesseract` adds thread-based batch OCR and a process-pool placeholder for A/B comparisons.
|
253
|
+
- `--tesseract-matrix` expands with a small matrix across output formats (`text`, `markdown`, `tsv`) and PSM modes
|
254
|
+
(`AUTO`, `SINGLE_BLOCK`, `SINGLE_LINE`) to quantify overhead of richer outputs and segmentation strategies.
|
255
|
+
|
256
|
+
Examples:
|
257
|
+
|
258
|
+
```bash
|
259
|
+
# Minimal Tesseract batch OCR benchmarks
|
260
|
+
uv run python -m benchmarks.src run --sync-only --tesseract
|
261
|
+
|
262
|
+
# Full Tesseract config matrix
|
263
|
+
uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-matrix
|
264
|
+
```
|
@@ -0,0 +1,179 @@
|
|
1
|
+
import json
|
2
|
+
import shutil
|
3
|
+
import tempfile
|
4
|
+
import time
|
5
|
+
from concurrent.futures import ProcessPoolExecutor
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Any
|
8
|
+
|
9
|
+
from PIL import Image, ImageDraw
|
10
|
+
|
11
|
+
from kreuzberg import extract_file_sync
|
12
|
+
from kreuzberg._ocr._tesseract import _process_image_with_tesseract
|
13
|
+
from kreuzberg._types import ExtractionConfig
|
14
|
+
from kreuzberg._utils._process_pool import get_optimal_worker_count, process_pool
|
15
|
+
|
16
|
+
|
17
|
+
def create_test_images(sizes: list[tuple[int, int]], output_dir: Path) -> list[Path]:
    """Write one synthetic PNG per entry in ``sizes`` and return the file paths.

    Each image has a white background tiled with the black label
    ``"Test <index>"`` every 100 px horizontally and 50 px vertically.

    Args:
        sizes: ``(width, height)`` pairs, one per image to generate.
        output_dir: Directory the PNG files are written to; created if missing
            (its parent must already exist).

    Returns:
        Paths of the written images, in the same order as ``sizes``.
    """
    output_dir.mkdir(exist_ok=True)

    def _render(index: int, width: int, height: int) -> Path:
        canvas = Image.new("RGB", (width, height), color="white")
        pen = ImageDraw.Draw(canvas)
        for top in range(0, height, 50):
            for left in range(0, width, 100):
                pen.text((left, top), f"Test {index}", fill="black")

        # The index in the file name keeps same-sized images from overwriting each other.
        target = output_dir / f"test_{width}x{height}_{index}.png"
        canvas.save(target)
        return target

    return [_render(index, width, height) for index, (width, height) in enumerate(sizes)]
|
34
|
+
|
35
|
+
|
36
|
+
def benchmark_batch_fixed_workers(images: list[Path], num_workers: int) -> dict[str, Any]:
    """Time OCR of ``images`` on a fresh process pool of a caller-chosen size.

    Args:
        images: Paths of the images to OCR; each one is submitted as its own task.
        num_workers: ``max_workers`` passed to ``ProcessPoolExecutor``.

    Returns:
        A dict with the keys ``strategy`` (always ``"fixed"``), ``workers``,
        ``batch_size``, ``duration`` (seconds, including pool creation and
        shutdown) and ``per_image`` (``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    ocr_options = {"language": "eng", "psm": 3}

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        pending = []
        for image in images:
            pending.append(executor.submit(_process_image_with_tesseract, str(image), ocr_options))
        # Wait for every task; result() re-raises any exception from the worker.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    per_image = elapsed / len(images) if images else 0
    return {
        "strategy": "fixed",
        "workers": num_workers,
        "batch_size": len(images),
        "duration": elapsed,
        "per_image": per_image,
    }
|
52
|
+
|
53
|
+
|
54
|
+
def benchmark_batch_dynamic_workers(images: list[Path]) -> dict[str, Any]:
    """Time OCR of ``images`` on a process pool sized by ``get_optimal_worker_count``.

    The worker count is obtained from
    ``get_optimal_worker_count(len(images), cpu_intensive=True)`` and a new
    ``ProcessPoolExecutor`` of that size is created for the batch.

    Args:
        images: Paths of the images to OCR; each one is submitted as its own task.

    Returns:
        A dict with the keys ``strategy`` (always ``"dynamic"``), ``workers``
        (the computed count), ``batch_size``, ``duration`` (seconds, including
        worker-count computation, pool creation and shutdown) and ``per_image``
        (``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    ocr_options = {"language": "eng", "psm": 3}

    worker_count = get_optimal_worker_count(len(images), cpu_intensive=True)

    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        pending = []
        for image in images:
            pending.append(executor.submit(_process_image_with_tesseract, str(image), ocr_options))
        # Wait for every task; result() re-raises any exception from the worker.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    per_image = elapsed / len(images) if images else 0
    return {
        "strategy": "dynamic",
        "workers": worker_count,
        "batch_size": len(images),
        "duration": elapsed,
        "per_image": per_image,
    }
|
72
|
+
|
73
|
+
|
74
|
+
def benchmark_batch_shared_pool(images: list[Path]) -> dict[str, Any]:
    """Time OCR of ``images`` using the pool yielded by ``process_pool()``.

    Args:
        images: Paths of the images to OCR; each one is submitted as its own task.

    Returns:
        A dict with the keys ``strategy`` (always ``"shared_pool"``),
        ``workers``, ``batch_size``, ``duration`` (seconds) and ``per_image``
        (``0`` for an empty batch).
    """
    started_at = time.perf_counter()
    ocr_options = {"language": "eng", "psm": 3}

    with process_pool() as executor:
        pending = []
        for image in images:
            pending.append(executor.submit(_process_image_with_tesseract, str(image), ocr_options))
        # Wait for every task to complete before stopping the clock.
        for task in pending:
            task.result()

    elapsed = time.perf_counter() - started_at
    per_image = elapsed / len(images) if images else 0
    return {
        "strategy": "shared_pool",
        # NOTE(review): the worker count is a hard-coded literal, not read from the
        # pool; presumably it mirrors the shared pool size on the author's machine - confirm.
        "workers": 14,
        "batch_size": len(images),
        "duration": elapsed,
        "per_image": per_image,
    }
|
90
|
+
|
91
|
+
|
92
|
+
def benchmark_extraction_api(images: list[Path]) -> dict[str, Any]:
    """Time sequential extraction of ``images`` through ``extract_file_sync``.

    Every image is processed one after another with an ``ExtractionConfig``
    that disables the cache and forces OCR.

    Args:
        images: Paths of the images to extract.

    Returns:
        A dict with the keys ``strategy`` (always ``"extraction_api"``),
        ``workers`` (the literal ``"auto"``), ``batch_size``, ``duration``
        (seconds, including config construction) and ``per_image`` (``0`` for
        an empty batch).
    """
    started_at = time.perf_counter()

    extraction_config = ExtractionConfig(use_cache=False, force_ocr=True)

    for source in images:
        extract_file_sync(source, config=extraction_config)

    elapsed = time.perf_counter() - started_at
    per_image = elapsed / len(images) if images else 0
    return {
        "strategy": "extraction_api",
        "workers": "auto",
        "batch_size": len(images),
        "duration": elapsed,
        "per_image": per_image,
    }
|
108
|
+
|
109
|
+
|
110
|
+
def main() -> None:
    """Benchmark the batch OCR strategies across image sizes and batch sizes.

    For each image size, synthetic images are generated in a temporary
    directory. Every batch size is then timed with the fixed-worker,
    dynamic-worker and shared-pool strategies, plus the extraction API for
    batches of at most 10 images. Each non-baseline strategy gets an
    ``improvement_pct`` relative to the fixed-worker duration. The results are
    written to ``results/batch_size_benchmarks.json`` (relative to the current
    working directory).
    """
    batch_sizes = [1, 2, 5, 10, 20]
    image_sizes = [
        (640, 480),
        (1024, 768),
        (1920, 1080),
    ]
    # Generate enough images for the largest batch once per size; smaller batches slice it.
    max_batch = max(batch_sizes)

    results: list[dict[str, Any]] = []

    test_dir = Path(tempfile.mkdtemp(prefix="kreuzberg_bench_"))
    try:
        for img_width, img_height in image_sizes:
            images = create_test_images([(img_width, img_height)] * max_batch, test_dir)

            for batch_size in batch_sizes:
                batch = images[:batch_size]

                fixed_result = benchmark_batch_fixed_workers(batch, 14)
                strategies = [
                    fixed_result,
                    benchmark_batch_dynamic_workers(batch),
                    benchmark_batch_shared_pool(batch),
                ]

                # The sequential API path is only measured for small batches.
                if batch_size <= 10:
                    strategies.append(benchmark_extraction_api(batch))

                # Express every other strategy as a percentage gain over the fixed-worker baseline.
                baseline = fixed_result["duration"]
                if baseline > 0:
                    for strategy in strategies[1:]:
                        strategy["improvement_pct"] = ((baseline - strategy["duration"]) / baseline) * 100

                results.append(
                    {
                        "image_size": f"{img_width}x{img_height}",
                        "batch_size": batch_size,
                        "strategies": strategies,
                    }
                )
    finally:
        # Always remove the generated images, even when a benchmark raises;
        # cleanup is best-effort so it never masks the real error or blocks the report.
        shutil.rmtree(test_dir, ignore_errors=True)

    output_file = Path("results/batch_size_benchmarks.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with output_file.open("w") as f:
        json.dump({"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "results": results}, f, indent=2)
|
176
|
+
|
177
|
+
|
178
|
+
if __name__ == "__main__":
|
179
|
+
main()
|
@@ -0,0 +1,83 @@
|
|
1
|
+
import json
|
2
|
+
import time
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from kreuzberg import extract_file_sync
|
7
|
+
from kreuzberg._types import ExtractionConfig
|
8
|
+
|
9
|
+
|
10
|
+
def benchmark_real_world_scenario(file_paths: list[Path], scenario_name: str) -> dict[str, Any]:
    """Time sequential extraction of ``file_paths`` with caching disabled.

    Args:
        file_paths: Files to extract one after another via ``extract_file_sync``.
        scenario_name: Label stored under the ``scenario`` key of the result.

    Returns:
        A dict with the keys ``scenario``, ``file_count``, ``duration``
        (seconds), ``per_file`` (``0`` when ``file_paths`` is empty) and
        ``total_chars`` (sum of ``len(result.content)`` over all files).
    """
    config = ExtractionConfig(use_cache=False)

    start = time.perf_counter()
    content_lengths = [len(extract_file_sync(path, config=config).content) for path in file_paths]
    duration = time.perf_counter() - start

    return {
        "scenario": scenario_name,
        "file_count": len(file_paths),
        "duration": duration,
        # Guard the empty batch instead of raising ZeroDivisionError.
        "per_file": duration / len(file_paths) if file_paths else 0,
        "total_chars": sum(content_lengths),
    }
|
28
|
+
|
29
|
+
|
30
|
+
def main() -> None:
    """Run the real-world extraction scenarios and write a JSON summary.

    Sample files are collected from the repository's ``tests/test_source_files``
    directory by glob pattern. Each scenario that finds at least one file is
    timed with ``benchmark_real_world_scenario``. The per-scenario results and
    aggregate totals are written to ``results/final_batch_validation.json``
    (relative to the current working directory).
    """
    # Resolve the sample directory from this script's location rather than a
    # machine-specific absolute path (assumes the script lives in <repo>/benchmarks/).
    test_dir = Path(__file__).resolve().parent.parent / "tests" / "test_source_files"

    # (scenario name, glob patterns, maximum number of files taken per pattern)
    scenario_specs: list[tuple[str, list[str], int]] = [
        ("Mixed Office Documents", ["*.pdf", "*.docx", "*.xlsx", "*.pptx"], 2),
        ("Image Batch Processing", ["*.png", "*.jpg", "*.jpeg"], 3),
        ("PDF Document Processing", ["*.pdf"], 5),
        ("Small Text Files", ["*.txt", "*.md", "*.html"], 3),
    ]

    scenarios: list[dict[str, Any]] = []
    for scenario_name, patterns, limit in scenario_specs:
        files: list[Path] = []
        for pattern in patterns:
            files.extend(list(test_dir.glob(pattern))[:limit])
        # Skip scenarios for which no sample files exist.
        if files:
            scenarios.append(benchmark_real_world_scenario(files, scenario_name))

    total_files = sum(s["file_count"] for s in scenarios)
    total_time = sum(s["duration"] for s in scenarios)
    total_chars = sum(s["total_chars"] for s in scenarios)

    output = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "scenarios": scenarios,
        "summary": {
            "total_files": total_files,
            "total_time": total_time,
            "avg_per_file": total_time / total_files if total_files > 0 else 0,
            "total_chars": total_chars,
            "throughput": total_chars / total_time if total_time > 0 else 0,
        },
    }

    output_file = Path("results/final_batch_validation.json")
    # Create the results directory first; opening the file fails if it is missing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w") as f:
        json.dump(output, f, indent=2)
|
80
|
+
|
81
|
+
|
82
|
+
if __name__ == "__main__":
|
83
|
+
main()
|
@@ -17,18 +17,13 @@ classifiers = [
|
|
17
17
|
# kreuzberg-bench = "src.cli:app"
|
18
18
|
|
19
19
|
dependencies = [
|
20
|
+
"click>=8.2.1",
|
20
21
|
"kreuzberg",
|
21
|
-
"
|
22
|
-
"memory-profiler>=0.61",
|
23
|
-
"pandas>=2",
|
22
|
+
"msgpack>=1.1.1",
|
24
23
|
"psutil>=5.9",
|
25
24
|
"py-spy>=0.3.14",
|
26
25
|
"rich>=13",
|
27
|
-
"typer>=0.9",
|
28
26
|
]
|
29
27
|
|
30
|
-
[tool.ruff]
|
31
|
-
lint.extend-ignore = [ "ARG002", "B008", "B904", "BLE001", "E722", "PLR2004", "PYI036", "SLF001" ]
|
32
|
-
|
33
28
|
[tool.uv.sources]
|
34
29
|
kreuzberg = { workspace = true }
|