kreuzberg 3.13.3__tar.gz → 3.14.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/ci.yaml +179 -16
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/docker-e2e-tests.yml +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/publish-docker.yml +1 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.pre-commit-config.yaml +1 -7
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/PKG-INFO +4 -4
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/ai-rulez.yaml +73 -7
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/examples/extraction-examples.md +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/api-server.md +59 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/extraction-configuration.md +75 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/ocr-configuration.md +65 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_api/main.py +126 -18
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_config.py +0 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_image.py +20 -2
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_pdf.py +21 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_spread_sheet.py +0 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_gmft.py +79 -33
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mcp/server.py +0 -76
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_base.py +1 -2
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_paddleocr.py +39 -13
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_tesseract.py +2 -3
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_registry.py +26 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_types.py +66 -3
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_cache.py +34 -12
- kreuzberg-3.14.1/kreuzberg/_utils/_image_preprocessing.py +346 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_ocr_cache.py +2 -5
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_process_pool.py +3 -3
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_table.py +4 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/cli.py +19 -2
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/extraction.py +4 -4
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/pyproject.toml +7 -7
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/main_test.py +36 -2
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/conftest.py +99 -0
- kreuzberg-3.14.1/tests/core/dpi_configuration_test.py +353 -0
- kreuzberg-3.14.1/tests/core/html_to_markdown_config_test.py +0 -0
- kreuzberg-3.14.1/tests/core/mime_types_test.py +0 -0
- kreuzberg-3.14.1/tests/core/registry_test.py +0 -0
- kreuzberg-3.14.1/tests/core/types_test.py +0 -0
- kreuzberg-3.14.1/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/e2e/docker_e2e_test.py +4 -4
- kreuzberg-3.14.1/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/html_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/image_test.py +7 -3
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pandoc_test.py +1 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pdf_test.py +7 -22
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/presentation_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/spreed_sheet_test.py +4 -0
- kreuzberg-3.14.1/tests/features/__init__.py +0 -0
- kreuzberg-3.14.1/tests/features/chunker_test.py +0 -0
- kreuzberg-3.14.1/tests/features/document_classification_test.py +0 -0
- kreuzberg-3.14.1/tests/features/entity_extraction_test.py +0 -0
- kreuzberg-3.14.1/tests/features/gmft_test.py +0 -0
- kreuzberg-3.14.1/tests/features/hooks_test.py +0 -0
- kreuzberg-3.14.1/tests/features/language_detection_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/__init__.py +0 -0
- kreuzberg-3.14.1/tests/integration/api/__init__.py +0 -0
- kreuzberg-3.14.1/tests/integration/api/large_file_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/api/mounted_config_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/dpi_integration_test.py +244 -0
- kreuzberg-3.14.1/tests/integration/multiprocessing/__init__.py +0 -0
- kreuzberg-3.14.1/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/ocr/__init__.py +0 -0
- kreuzberg-3.14.1/tests/integration/ocr/device_integration_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- kreuzberg-3.14.1/tests/integration/regression_test.py +134 -0
- kreuzberg-3.14.1/tests/interfaces/__init__.py +0 -0
- kreuzberg-3.14.1/tests/mcp/__init__.py +0 -0
- kreuzberg-3.14.1/tests/mcp/mcp_server_test.py +0 -0
- kreuzberg-3.14.1/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/gmft_isolated_test.py +54 -58
- kreuzberg-3.14.1/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/easyocr_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/paddleocr_test.py +7 -5
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/tesseract_test.py +1 -1
- kreuzberg-3.14.1/tests/test_source_files/sharable-web-guide.pdf +0 -0
- kreuzberg-3.14.1/tests/utils/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/device_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/ocr_cache_test.py +7 -6
- kreuzberg-3.14.1/tests/utils/playa_helpers_test.py +0 -0
- kreuzberg-3.14.1/tests/utils/playa_test.py +0 -0
- kreuzberg-3.14.1/tests/utils/quality_test.py +121 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/serialization_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/table_test.py +6 -6
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/tmp_test.py +1 -1
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/uv.lock +297 -286
- kreuzberg-3.13.3/tests/api/large_file_test.py +0 -184
- kreuzberg-3.13.3/tests/api/mounted_config_test.py +0 -184
- kreuzberg-3.13.3/tests/chunker_test.py +0 -102
- kreuzberg-3.13.3/tests/cli_command_test.py +0 -481
- kreuzberg-3.13.3/tests/cli_integration_test.py +0 -858
- kreuzberg-3.13.3/tests/cli_test.py +0 -324
- kreuzberg-3.13.3/tests/config_test.py +0 -1540
- kreuzberg-3.13.3/tests/document_classification_test.py +0 -837
- kreuzberg-3.13.3/tests/entity_extraction_test.py +0 -588
- kreuzberg-3.13.3/tests/exceptions_test.py +0 -91
- kreuzberg-3.13.3/tests/extraction_batch_test.py +0 -253
- kreuzberg-3.13.3/tests/extraction_test.py +0 -752
- kreuzberg-3.13.3/tests/gmft_extended_test.py +0 -137
- kreuzberg-3.13.3/tests/gmft_test.py +0 -788
- kreuzberg-3.13.3/tests/hooks_test.py +0 -205
- kreuzberg-3.13.3/tests/html_to_markdown_config_test.py +0 -217
- kreuzberg-3.13.3/tests/language_detection_test.py +0 -152
- kreuzberg-3.13.3/tests/mcp_server_test.py +0 -757
- kreuzberg-3.13.3/tests/mime_types_test.py +0 -195
- kreuzberg-3.13.3/tests/multiprocessing/gmft_integration_test.py +0 -98
- kreuzberg-3.13.3/tests/ocr/device_integration_test.py +0 -268
- kreuzberg-3.13.3/tests/ocr/tesseract_tsv_integration_test.py +0 -273
- kreuzberg-3.13.3/tests/playa_helpers_test.py +0 -473
- kreuzberg-3.13.3/tests/playa_test.py +0 -111
- kreuzberg-3.13.3/tests/registry_test.py +0 -190
- kreuzberg-3.13.3/tests/regression_api_test.py +0 -81
- kreuzberg-3.13.3/tests/regression_test.py +0 -159
- kreuzberg-3.13.3/tests/regression_with_config_test.py +0 -145
- kreuzberg-3.13.3/tests/tesseract_sync_formats_test.py +0 -169
- kreuzberg-3.13.3/tests/types_test.py +0 -374
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.commitlintrc +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.deepsource.toml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.docker/README.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.dockerignore +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.gitignore +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/LICENSE +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/README.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/Taskfile.yml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-compose.example.yml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-logs/docker-info.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docker-logs/docker-version.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/cli.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/contributing.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/output.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/baseline.json +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/serialization.json +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/results/statistical.json +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/test_report.json +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/conftest.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/api/runtime_config_test.py +0 -0
- {kreuzberg-3.13.3/tests/e2e → kreuzberg-3.14.1/tests/core}/__init__.py +0 -0
- /kreuzberg-3.13.3/tests/extractors/__init__.py → /kreuzberg-3.14.1/tests/core/config_test.py +0 -0
- /kreuzberg-3.13.3/tests/multiprocessing/__init__.py → /kreuzberg-3.14.1/tests/core/exceptions_test.py +0 -0
- /kreuzberg-3.13.3/tests/ocr/__init__.py → /kreuzberg-3.14.1/tests/core/extraction_batch_test.py +0 -0
- /kreuzberg-3.13.3/tests/utils/__init__.py → /kreuzberg-3.14.1/tests/core/extraction_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/test-article.pdf +0 -0
- /kreuzberg-3.13.3/tests/test_source_files/testXls.xls → /kreuzberg-3.14.1/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.13.3 → kreuzberg-3.14.1}/tests/utils/sync_test.py +0 -0
@@ -7,7 +7,6 @@ on:
|
|
7
7
|
push:
|
8
8
|
branches:
|
9
9
|
- main
|
10
|
-
- feat/smart-multiprocessing
|
11
10
|
|
12
11
|
jobs:
|
13
12
|
validate:
|
@@ -38,7 +37,7 @@ jobs:
|
|
38
37
|
echo "Removing existing .venv directory on Windows"
|
39
38
|
rm -rf .venv
|
40
39
|
fi
|
41
|
-
uv sync --all-
|
40
|
+
uv sync --all-extras --dev
|
42
41
|
shell: bash
|
43
42
|
|
44
43
|
- name: Load Cached Pre-Commit Dependencies
|
@@ -53,6 +52,7 @@ jobs:
|
|
53
52
|
|
54
53
|
coverage:
|
55
54
|
needs: validate
|
55
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
56
56
|
runs-on: ubuntu-latest
|
57
57
|
timeout-minutes: 120
|
58
58
|
steps:
|
@@ -88,7 +88,7 @@ jobs:
|
|
88
88
|
max_attempts: 3
|
89
89
|
retry_wait_seconds: 30
|
90
90
|
command: |
|
91
|
-
uv sync --all-
|
91
|
+
uv sync --all-extras --dev
|
92
92
|
shell: bash
|
93
93
|
|
94
94
|
- name: Install System Dependencies
|
@@ -115,7 +115,7 @@ jobs:
|
|
115
115
|
shell: bash
|
116
116
|
|
117
117
|
- name: Upload Coverage to DeepSource
|
118
|
-
if: always()
|
118
|
+
if: always()
|
119
119
|
env:
|
120
120
|
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
121
121
|
run: |
|
@@ -134,15 +134,178 @@ jobs:
|
|
134
134
|
.coverage
|
135
135
|
retention-days: 7
|
136
136
|
|
137
|
-
test:
|
138
|
-
needs:
|
137
|
+
test-pr:
|
138
|
+
needs: validate
|
139
|
+
if: github.event_name == 'pull_request' && needs.validate.result == 'success'
|
140
|
+
runs-on: ubuntu-latest
|
141
|
+
strategy:
|
142
|
+
fail-fast: false
|
143
|
+
matrix:
|
144
|
+
test-category:
|
145
|
+
- name: "core"
|
146
|
+
path: "tests/core,tests/utils"
|
147
|
+
system-deps: false
|
148
|
+
timeout: 15
|
149
|
+
- name: "extractors"
|
150
|
+
path: "tests/extractors"
|
151
|
+
system-deps: true
|
152
|
+
timeout: 20
|
153
|
+
- name: "integration"
|
154
|
+
path: "tests/integration,tests/api"
|
155
|
+
system-deps: true
|
156
|
+
timeout: 25
|
157
|
+
- name: "features"
|
158
|
+
path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
|
159
|
+
system-deps: true
|
160
|
+
timeout: 20
|
161
|
+
timeout-minutes: ${{ matrix.test-category.timeout }}
|
162
|
+
steps:
|
163
|
+
- name: Checkout
|
164
|
+
uses: actions/checkout@v5
|
165
|
+
|
166
|
+
- name: Install uv
|
167
|
+
uses: astral-sh/setup-uv@v6
|
168
|
+
with:
|
169
|
+
enable-cache: true
|
170
|
+
|
171
|
+
- name: Install Python
|
172
|
+
uses: actions/setup-python@v6
|
173
|
+
with:
|
174
|
+
python-version: "3.13"
|
175
|
+
|
176
|
+
- name: Cache Python Dependencies
|
177
|
+
uses: actions/cache@v4
|
178
|
+
with:
|
179
|
+
path: |
|
180
|
+
~/.cache/uv
|
181
|
+
.venv
|
182
|
+
key: python-dependencies-ubuntu-latest-3.13-${{ matrix.test-category.name }}-${{ hashFiles('uv.lock') }}
|
183
|
+
restore-keys: |
|
184
|
+
python-dependencies-ubuntu-latest-3.13-
|
185
|
+
|
186
|
+
- name: Install Dependencies
|
187
|
+
run: uv sync --all-extras --dev
|
188
|
+
|
189
|
+
- name: Install System Dependencies
|
190
|
+
if: matrix.test-category.system-deps
|
191
|
+
run: |
|
192
|
+
sudo apt-get update
|
193
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
194
|
+
|
195
|
+
- name: Run Tests - ${{ matrix.test-category.name }}
|
196
|
+
run: uv run pytest $(echo "${{ matrix.test-category.path }}" | tr ',' ' ') -v --reruns 1 --reruns-delay 1 --cov=kreuzberg --cov-append --cov-report=lcov:coverage-${{ matrix.test-category.name }}.lcov
|
197
|
+
|
198
|
+
- name: Upload Coverage Artifacts
|
199
|
+
uses: actions/upload-artifact@v4
|
200
|
+
with:
|
201
|
+
name: coverage-${{ matrix.test-category.name }}-${{ github.sha }}
|
202
|
+
path: coverage-${{ matrix.test-category.name }}.lcov
|
203
|
+
retention-days: 1
|
204
|
+
|
205
|
+
coverage-pr:
|
206
|
+
needs: test-pr
|
207
|
+
if: github.event_name == 'pull_request' && always()
|
208
|
+
runs-on: ubuntu-latest
|
209
|
+
timeout-minutes: 10
|
210
|
+
steps:
|
211
|
+
- name: Checkout
|
212
|
+
uses: actions/checkout@v5
|
213
|
+
|
214
|
+
- name: Download Coverage Artifacts
|
215
|
+
uses: actions/download-artifact@v4
|
216
|
+
with:
|
217
|
+
pattern: coverage-*-${{ github.sha }}
|
218
|
+
merge-multiple: true
|
219
|
+
|
220
|
+
- name: Install uv
|
221
|
+
uses: astral-sh/setup-uv@v6
|
222
|
+
with:
|
223
|
+
enable-cache: true
|
224
|
+
|
225
|
+
- name: Install Python
|
226
|
+
uses: actions/setup-python@v6
|
227
|
+
with:
|
228
|
+
python-version: "3.13"
|
229
|
+
|
230
|
+
- name: Install Dependencies
|
231
|
+
run: uv sync --dev
|
232
|
+
|
233
|
+
- name: Combine Coverage Reports
|
234
|
+
run: |
|
235
|
+
# Install lcov for combining reports
|
236
|
+
sudo apt-get update && sudo apt-get install -y lcov
|
237
|
+
|
238
|
+
# List available coverage files
|
239
|
+
echo "Available coverage files:"
|
240
|
+
find . -name "coverage-*.lcov" -type f || echo "No coverage files found"
|
241
|
+
|
242
|
+
# Combine all lcov files if they exist
|
243
|
+
coverage_files=($(find . -name "coverage-*.lcov" -type f))
|
244
|
+
if [ ${#coverage_files[@]} -gt 0 ]; then
|
245
|
+
echo "Combining ${#coverage_files[@]} coverage files..."
|
246
|
+
if [ ${#coverage_files[@]} -eq 1 ]; then
|
247
|
+
# Only one file, just copy it
|
248
|
+
cp "${coverage_files[0]}" coverage.lcov
|
249
|
+
else
|
250
|
+
# Multiple files, combine them
|
251
|
+
lcov --rc branch_coverage=1 $(printf " -a %s" "${coverage_files[@]}") -o coverage.lcov
|
252
|
+
fi
|
253
|
+
else
|
254
|
+
echo "No coverage files to combine, creating empty coverage.lcov"
|
255
|
+
echo "TN:" > coverage.lcov
|
256
|
+
echo "end_of_record" >> coverage.lcov
|
257
|
+
fi
|
258
|
+
|
259
|
+
- name: Upload Coverage to DeepSource
|
260
|
+
if: always()
|
261
|
+
env:
|
262
|
+
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
263
|
+
run: |
|
264
|
+
# Install DeepSource CLI
|
265
|
+
curl -fsSL https://deepsource.io/cli | sh
|
266
|
+
# Upload coverage report
|
267
|
+
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
268
|
+
|
269
|
+
test-full:
|
270
|
+
needs: validate
|
271
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.validate.result == 'success'
|
139
272
|
runs-on: ${{ matrix.os }}
|
140
273
|
strategy:
|
141
274
|
fail-fast: false
|
142
275
|
matrix:
|
143
276
|
os: [ubuntu-latest, windows-latest, macos-latest]
|
144
277
|
python: ["3.10", "3.11", "3.12", "3.13"]
|
145
|
-
|
278
|
+
test-category:
|
279
|
+
- name: "core"
|
280
|
+
path: "tests/core,tests/utils"
|
281
|
+
system-deps: false
|
282
|
+
timeout: 20
|
283
|
+
- name: "extractors"
|
284
|
+
path: "tests/extractors"
|
285
|
+
system-deps: true
|
286
|
+
timeout: 25
|
287
|
+
- name: "integration"
|
288
|
+
path: "tests/integration,tests/api"
|
289
|
+
system-deps: true
|
290
|
+
timeout: 30
|
291
|
+
- name: "features"
|
292
|
+
path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
|
293
|
+
system-deps: true
|
294
|
+
timeout: 25
|
295
|
+
exclude:
|
296
|
+
- test-category: {name: "extractors"}
|
297
|
+
python: "3.11"
|
298
|
+
- test-category: {name: "extractors"}
|
299
|
+
python: "3.12"
|
300
|
+
- test-category: {name: "integration"}
|
301
|
+
python: "3.11"
|
302
|
+
- test-category: {name: "integration"}
|
303
|
+
python: "3.12"
|
304
|
+
- test-category: {name: "features"}
|
305
|
+
python: "3.11"
|
306
|
+
- test-category: {name: "features"}
|
307
|
+
python: "3.12"
|
308
|
+
timeout-minutes: ${{ matrix.test-category.timeout }}
|
146
309
|
steps:
|
147
310
|
- name: Checkout
|
148
311
|
uses: actions/checkout@v5
|
@@ -180,7 +343,7 @@ jobs:
|
|
180
343
|
echo "Removing existing .venv directory on Windows"
|
181
344
|
rm -rf .venv
|
182
345
|
fi
|
183
|
-
uv sync --all-
|
346
|
+
uv sync --all-extras --dev
|
184
347
|
shell: bash
|
185
348
|
|
186
349
|
- name: Cache Test Artifacts
|
@@ -190,7 +353,7 @@ jobs:
|
|
190
353
|
key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
|
191
354
|
|
192
355
|
- name: Cache and Install Homebrew (macOS)
|
193
|
-
if: runner.os == 'macOS'
|
356
|
+
if: runner.os == 'macOS' && matrix.test-category.system-deps
|
194
357
|
uses: nick-fields/retry@v3
|
195
358
|
with:
|
196
359
|
timeout_minutes: 10
|
@@ -204,7 +367,7 @@ jobs:
|
|
204
367
|
shell: bash
|
205
368
|
|
206
369
|
- name: Cache and Install APT Packages (Linux)
|
207
|
-
if: runner.os == 'Linux'
|
370
|
+
if: runner.os == 'Linux' && matrix.test-category.system-deps
|
208
371
|
uses: nick-fields/retry@v3
|
209
372
|
with:
|
210
373
|
timeout_minutes: 5
|
@@ -216,7 +379,7 @@ jobs:
|
|
216
379
|
shell: bash
|
217
380
|
|
218
381
|
- name: Install System Dependencies (Windows)
|
219
|
-
if: runner.os == 'Windows'
|
382
|
+
if: runner.os == 'Windows' && matrix.test-category.system-deps
|
220
383
|
uses: nick-fields/retry@v3
|
221
384
|
with:
|
222
385
|
timeout_minutes: 10
|
@@ -231,12 +394,12 @@ jobs:
|
|
231
394
|
pandoc --version
|
232
395
|
shell: pwsh
|
233
396
|
|
234
|
-
- name: Run Tests
|
397
|
+
- name: Run Tests - ${{ matrix.test-category.name }}
|
235
398
|
uses: nick-fields/retry@v3
|
236
399
|
with:
|
237
|
-
timeout_minutes:
|
238
|
-
max_attempts:
|
239
|
-
retry_wait_seconds:
|
400
|
+
timeout_minutes: 10
|
401
|
+
max_attempts: 2
|
402
|
+
retry_wait_seconds: 5
|
240
403
|
command: |
|
241
|
-
uv run pytest -
|
404
|
+
uv run pytest $(echo "${{ matrix.test-category.path }}" | tr ',' ' ') -v --reruns 1 --reruns-delay 1
|
242
405
|
shell: bash
|
@@ -5,12 +5,6 @@ repos:
|
|
5
5
|
- id: commitlint
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
-
# Temporarily disabled due to CI environment issues
|
9
|
-
# - repo: https://github.com/Goldziher/ai-rulez
|
10
|
-
# rev: v2.0.1
|
11
|
-
# hooks:
|
12
|
-
# - id: ai-rulez-validate
|
13
|
-
# - id: ai-rulez-generate
|
14
8
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
15
9
|
rev: v6.0.0
|
16
10
|
hooks:
|
@@ -54,7 +48,7 @@ repos:
|
|
54
48
|
hooks:
|
55
49
|
- id: pyproject-fmt
|
56
50
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
57
|
-
rev: v0.
|
51
|
+
rev: v0.13.0
|
58
52
|
hooks:
|
59
53
|
- id: ruff
|
60
54
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.14.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,10 +31,10 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
35
|
-
Requires-Dist: mcp>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.11.0
|
35
|
+
Requires-Dist: mcp>=1.14.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: numpy>=
|
37
|
+
Requires-Dist: numpy>=2.0.0
|
38
38
|
Requires-Dist: playa-pdf>=0.7.0
|
39
39
|
Requires-Dist: polars>=1.33.1
|
40
40
|
Requires-Dist: psutil>=7.0.0
|
@@ -388,6 +388,18 @@ rules:
|
|
388
388
|
- When committing, always use the format specified in the CLAUDE.md
|
389
389
|
name: Important Instructions
|
390
390
|
priority: critical
|
391
|
+
- content: |
|
392
|
+
### Docstrings and Comments Guidelines
|
393
|
+
- **NO docstrings in private functions** (functions starting with `_`)
|
394
|
+
- **NO docstrings in private files** (files starting with `_`)
|
395
|
+
- **NO docstrings in private folders** (folders starting with `_`)
|
396
|
+
- **NO docstrings in test files** (files in `tests/` directory)
|
397
|
+
- **ONLY use docstrings in public API** (what's documented in API reference)
|
398
|
+
- **NO redundant comments** - code should be self-documenting
|
399
|
+
- **Comments only when necessary** to explain complex logic or non-obvious decisions
|
400
|
+
- **Prefer clear variable/function names** over comments
|
401
|
+
name: Documentation Standards
|
402
|
+
priority: critical
|
391
403
|
- content: |
|
392
404
|
### Core Extraction Flow
|
393
405
|
1. **Entry Point**: `extraction.py` provides main functions (`extract_file`, `extract_bytes`, etc.)
|
@@ -425,14 +437,68 @@ rules:
|
|
425
437
|
name: Error Handling
|
426
438
|
priority: medium
|
427
439
|
- content: |
|
428
|
-
|
429
|
-
-
|
430
|
-
-
|
431
|
-
|
432
|
-
|
433
|
-
|
440
|
+
### File Organization and Naming
|
441
|
+
- **Test files MUST end with `_test.py`** - No exceptions
|
442
|
+
- **Logical directory structure**:
|
443
|
+
```
|
444
|
+
tests/
|
445
|
+
├── core/ # Core functionality (extraction, config, types)
|
446
|
+
├── features/ # Feature-specific (chunking, language detection)
|
447
|
+
├── integration/ # Integration tests (API, real file processing)
|
448
|
+
├── interfaces/ # User interfaces (CLI, MCP)
|
449
|
+
├── extractors/ # File format extractors (PDF, image, etc.)
|
450
|
+
├── ocr/ # OCR backend tests
|
451
|
+
├── utils/ # Utility function tests
|
452
|
+
└── e2e/ # End-to-end tests
|
453
|
+
```
|
454
|
+
|
455
|
+
### Test Structure and Patterns
|
456
|
+
- **Only function-based tests** - No class-based test methods
|
457
|
+
- **Test naming**: `test_<function>_<scenario>_<expected_outcome>`
|
458
|
+
- ✅ `test_extract_pdf_with_ocr_returns_text()`
|
459
|
+
- ✅ `test_extract_file_raises_validation_error_when_file_missing()`
|
460
|
+
- ❌ `test_basic_extraction()` (too vague)
|
461
|
+
- **Async/sync variants**: Test both async and sync versions where applicable
|
462
|
+
- **Parameterized tests**: Use `@pytest.mark.parametrize` for multiple scenarios
|
463
|
+
|
464
|
+
### Mocking Guidelines
|
465
|
+
- **NEVER mock anyio/asyncio** - Only mock external dependencies
|
466
|
+
- **Mock external services**: OCR engines, file system operations, network calls
|
467
|
+
- **Use real objects when possible**: Prefer `tmp_path` over mocking file operations
|
468
|
+
- **Mock sparingly**: Only when necessary for isolation or performance
|
469
|
+
- **Add comments for legitimate mocks**: Explain why mocking is required
|
470
|
+
```python
|
471
|
+
# Mock OCR backend for predictable testing ~keep
|
472
|
+
mock_ocr = mocker.patch("kreuzberg._ocr.get_backend")
|
473
|
+
```
|
474
|
+
|
475
|
+
### Fixtures and Test Data
|
476
|
+
- **Session-scoped fixtures** for stateless objects (extractors, configs)
|
477
|
+
- **Shared fixtures** in `tests/conftest.py` for common test data
|
478
|
+
- **Test files** in `tests/test_source_files/` for various formats
|
479
|
+
- **Temporary files** using pytest's `tmp_path` fixture
|
480
|
+
|
481
|
+
### Test Coverage and Quality
|
482
|
+
- **95% minimum coverage** requirement
|
483
|
+
- **Test all error paths** - Every exception should be tested
|
484
|
+
- **Edge cases**: Empty inputs, large files, malformed data
|
485
|
+
- **Performance considerations**: Mark slow tests appropriately
|
486
|
+
- **CI resilience**: OCR tests marked as `xfail` in CI environments
|
487
|
+
|
488
|
+
### Test Helpers and Utilities
|
489
|
+
- **Shared assertions** in `tests/extractors/test_helpers.py`:
|
490
|
+
- `assert_valid_extraction_result()` - Standard result validation
|
491
|
+
- `assert_extraction_error()` - Error case validation
|
492
|
+
- `assert_ocr_result()` - OCR-specific validation
|
493
|
+
- **Avoid repetitive assertions** - Use helper functions
|
494
|
+
|
495
|
+
### Integration vs Unit Tests
|
496
|
+
- **Unit tests**: Fast, isolated, mock external dependencies
|
497
|
+
- **Integration tests**: Real file processing, external services, end-to-end flows
|
498
|
+
- **Separation**: Integration tests in `tests/integration/` directory
|
499
|
+
- **Timeouts**: Integration tests use timeouts and retry logic
|
434
500
|
name: Testing Patterns
|
435
|
-
priority:
|
501
|
+
priority: critical
|
436
502
|
- content: |
|
437
503
|
### GitHub Actions Workflows
|
438
504
|
- **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
|
@@ -135,7 +135,7 @@ async def extract_tables_from_pdf():
|
|
135
135
|
print(f"Table {i+1} on page {table['page_number']}:")
|
136
136
|
print(table["text"]) # Markdown formatted table
|
137
137
|
|
138
|
-
# Work with the
|
138
|
+
# Work with the polars DataFrame
|
139
139
|
df = table["df"]
|
140
140
|
print(f"Table shape: {df.shape}")
|
141
141
|
|
@@ -62,6 +62,7 @@ Extract text from one or more files.
|
|
62
62
|
- Method: `POST`
|
63
63
|
- Content-Type: `multipart/form-data`
|
64
64
|
- Body: One or more files with field name `data`
|
65
|
+
- **Maximum file size: 1GB per file**
|
65
66
|
|
66
67
|
**Response:**
|
67
68
|
|
@@ -222,6 +223,42 @@ curl -X POST "http://localhost:8000/extract?max_chars=1000" \
|
|
222
223
|
|
223
224
|
Result: max_chars will be 500 (from header)
|
224
225
|
|
226
|
+
## Interactive API Documentation
|
227
|
+
|
228
|
+
Kreuzberg automatically generates comprehensive OpenAPI documentation that you can access through your web browser when the API server is running.
|
229
|
+
|
230
|
+
### Accessing the Documentation
|
231
|
+
|
232
|
+
Once the API server is running, you can access interactive documentation at:
|
233
|
+
|
234
|
+
- **OpenAPI Schema**: `http://localhost:8000/schema/openapi.json`
|
235
|
+
- **Swagger UI**: `http://localhost:8000/schema/swagger`
|
236
|
+
- **ReDoc Documentation**: `http://localhost:8000/schema/redoc`
|
237
|
+
- **Stoplight Elements**: `http://localhost:8000/schema/elements`
|
238
|
+
- **RapiDoc**: `http://localhost:8000/schema/rapidoc`
|
239
|
+
|
240
|
+
### Features
|
241
|
+
|
242
|
+
The interactive documentation provides:
|
243
|
+
|
244
|
+
- **Complete API Reference**: All endpoints with detailed parameter descriptions
|
245
|
+
- **Try It Out**: Test API endpoints directly from the browser
|
246
|
+
- **Request/Response Examples**: Sample requests and responses for each endpoint
|
247
|
+
- **Schema Validation**: Interactive validation of request parameters
|
248
|
+
- **Download Options**: Export the OpenAPI specification
|
249
|
+
|
250
|
+
### Example Usage
|
251
|
+
|
252
|
+
```bash
|
253
|
+
# Start the API server
|
254
|
+
litestar --app kreuzberg._api.main:app run
|
255
|
+
|
256
|
+
# Open your browser to view the documentation
|
257
|
+
open http://localhost:8000/schema/swagger
|
258
|
+
```
|
259
|
+
|
260
|
+
The documentation includes examples for all configuration options, making it easy to understand the full capabilities of the extraction API.
|
261
|
+
|
225
262
|
#### Error Handling
|
226
263
|
|
227
264
|
Invalid configuration returns appropriate error responses:
|
@@ -258,6 +295,27 @@ Error responses include:
|
|
258
295
|
}
|
259
296
|
```
|
260
297
|
|
298
|
+
### Debugging 500 Errors
|
299
|
+
|
300
|
+
For detailed error information when 500 Internal Server Errors occur, set the `DEBUG` environment variable:
|
301
|
+
|
302
|
+
```bash
|
303
|
+
# Enable debug mode for detailed 500 error responses
|
304
|
+
DEBUG=1 litestar --app kreuzberg._api.main:app run
|
305
|
+
|
306
|
+
# Or with uvicorn
|
307
|
+
DEBUG=1 uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port 8000
|
308
|
+
```
|
309
|
+
|
310
|
+
When `DEBUG=1` is set, 500 errors will include:
|
311
|
+
|
312
|
+
- Full stack traces
|
313
|
+
- Detailed error context
|
314
|
+
- Internal state information
|
315
|
+
- Request debugging details
|
316
|
+
|
317
|
+
⚠️ **Warning**: Only enable debug mode in development environments. Debug output may expose sensitive details, so debug mode should never be enabled in production.
|
318
|
+
|
261
319
|
## Features
|
262
320
|
|
263
321
|
- **Runtime Configuration**: Configure extraction via query parameters and HTTP headers
|
@@ -301,6 +359,7 @@ For production use, consider:
|
|
301
359
|
1. **Monitoring**: Enable OpenTelemetry exporters
|
302
360
|
1. **Rate Limiting**: Add rate limiting middleware
|
303
361
|
1. **Authentication**: Add authentication middleware if needed
|
362
|
+
1. **Security**: Ensure the `DEBUG` environment variable is not set
|
304
363
|
|
305
364
|
Example production command:
|
306
365
|
|
@@ -57,6 +57,13 @@ detector_base_threshold = 0.9
|
|
57
57
|
remove_null_rows = true
|
58
58
|
enable_multi_header = true
|
59
59
|
|
60
|
+
# DPI and Image Processing configuration
|
61
|
+
target_dpi = 150 # Target DPI for document processing
|
62
|
+
max_image_dimension = 25000 # Maximum pixel dimension before auto-scaling
|
63
|
+
auto_adjust_dpi = true # Automatically adjust DPI for large documents
|
64
|
+
min_dpi = 72 # Minimum DPI threshold
|
65
|
+
max_dpi = 600 # Maximum DPI threshold
|
66
|
+
|
60
67
|
# Language detection configuration
|
61
68
|
[language_detection]
|
62
69
|
multilingual = true
|
@@ -91,6 +98,13 @@ auto_detect_document_type = true
|
|
91
98
|
document_classification_mode = "text"
|
92
99
|
type_confidence_threshold = 0.5
|
93
100
|
|
101
|
+
# DPI and Image Processing
|
102
|
+
target_dpi = 150
|
103
|
+
max_image_dimension = 25000
|
104
|
+
auto_adjust_dpi = true
|
105
|
+
min_dpi = 72
|
106
|
+
max_dpi = 600
|
107
|
+
|
94
108
|
[tool.kreuzberg.tesseract]
|
95
109
|
language = "eng"
|
96
110
|
psm = 6
|
@@ -536,6 +550,67 @@ python -m spacy download fr_core_news_sm # French
|
|
536
550
|
|
537
551
|
Available spaCy models include: `en_core_web_sm`, `de_core_news_sm`, `fr_core_news_sm`, `es_core_news_sm`, `pt_core_news_sm`, `it_core_news_sm`, `nl_core_news_sm`, `zh_core_web_sm`, `ja_core_news_sm`, `ko_core_news_sm`, `ru_core_news_sm`, and many others.
|
538
552
|
|
553
|
+
### DPI and Image Processing
|
554
|
+
|
555
|
+
Kreuzberg provides intelligent DPI (dots per inch) configuration to optimize document processing quality and performance. This feature automatically handles image scaling for large documents while maintaining OCR quality.
|
556
|
+
|
557
|
+
```python
|
558
|
+
from kreuzberg import extract_file, ExtractionConfig
|
559
|
+
|
560
|
+
# Default DPI configuration (optimized for most documents)
|
561
|
+
result = await extract_file("large_document.pdf")
|
562
|
+
|
563
|
+
# Custom DPI configuration for high-quality documents
|
564
|
+
config = ExtractionConfig(
|
565
|
+
target_dpi=200, # Higher quality for detailed documents
|
566
|
+
max_image_dimension=30000, # Allow larger images
|
567
|
+
auto_adjust_dpi=True, # Automatically scale down if too large
|
568
|
+
min_dpi=100, # Higher minimum for quality
|
569
|
+
max_dpi=400, # Lower maximum to control processing time
|
570
|
+
)
|
571
|
+
result = await extract_file("technical_drawing.pdf", config=config)
|
572
|
+
|
573
|
+
# Fast processing configuration for large batches
|
574
|
+
config = ExtractionConfig(
|
575
|
+
target_dpi=120, # Lower DPI for faster processing
|
576
|
+
max_image_dimension=15000, # Smaller maximum size
|
577
|
+
auto_adjust_dpi=True, # Still allow automatic scaling
|
578
|
+
)
|
579
|
+
result = await extract_file("large_batch_document.pdf", config=config)
|
580
|
+
```
|
581
|
+
|
582
|
+
#### DPI Configuration Options
|
583
|
+
|
584
|
+
- **`target_dpi`** (default: 150): The desired DPI for document processing. Higher values provide better quality but slower processing.
|
585
|
+
|
586
|
+
- **`max_image_dimension`** (default: 25000): Maximum pixel dimension (width or height) before automatic scaling kicks in.
|
587
|
+
|
588
|
+
- **`auto_adjust_dpi`** (default: True): Automatically reduce DPI for oversized documents to stay within memory and processing limits.
|
589
|
+
|
590
|
+
- **`min_dpi`** / **`max_dpi`** (defaults: 72/600): Bounds for automatic DPI adjustment to ensure quality remains within acceptable ranges.
|
591
|
+
|
592
|
+
#### When to Adjust DPI Settings
|
593
|
+
|
594
|
+
**Increase DPI for:**
|
595
|
+
|
596
|
+
- Technical documents with small text or fine details
|
597
|
+
- Documents that will undergo further image processing
|
598
|
+
- High-quality archival processing
|
599
|
+
|
600
|
+
**Decrease DPI for:**
|
601
|
+
|
602
|
+
- Large batch processing where speed is important
|
603
|
+
- Documents with simple layouts and large text
|
604
|
+
- Memory-constrained environments
|
605
|
+
|
606
|
+
**Use auto-adjustment for:**
|
607
|
+
|
608
|
+
- Mixed document types with varying sizes
|
609
|
+
- Unknown document dimensions
|
610
|
+
- Production environments processing diverse content
|
611
|
+
|
612
|
+
The DPI system prevents "Image too large" errors while maintaining an optimal quality-performance balance.
|
613
|
+
|
539
614
|
### Batch Processing
|
540
615
|
|
541
616
|
```python
|