kreuzberg 3.8.2__tar.gz → 3.9.1__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- kreuzberg-3.9.1/.deepsource.toml +54 -0
- kreuzberg-3.9.1/.github/workflows/ci.yaml +197 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.gitignore +3 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/PKG-INFO +17 -13
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/README.md +6 -5
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/end_to_end_benchmark.py +1 -1
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/statistical_benchmark.py +1 -1
- kreuzberg-3.9.1/docs/changelog.md +49 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/index.md +1 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/basic-usage.md +28 -0
- kreuzberg-3.9.1/docs/user-guide/document-classification.md +53 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/extraction-configuration.md +6 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/index.md +1 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_chunker.py +3 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_config.py +1 -1
- kreuzberg-3.9.1/kreuzberg/_document_classification.py +156 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_entity_extraction.py +3 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_image.py +4 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_pdf.py +18 -10
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_spread_sheet.py +4 -5
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_structured.py +24 -18
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_gmft.py +25 -31
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mime_types.py +1 -1
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_base.py +1 -1
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_easyocr.py +4 -4
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_paddleocr.py +3 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/_tesseract.py +10 -14
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_types.py +23 -7
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_cache.py +2 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_device.py +7 -7
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/cli.py +2 -2
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/extraction.py +18 -9
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/mkdocs.yaml +1 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/pyproject.toml +27 -10
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/conftest.py +5 -0
- kreuzberg-3.9.1/tests/document_classification_test.py +86 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/entity_extraction_test.py +2 -2
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pdf_test.py +0 -2
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/gmft_test.py +3 -3
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/gmft_integration_test.py +2 -1
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/gmft_isolated_test.py +5 -9
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/device_integration_test.py +14 -13
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/paddleocr_test.py +0 -5
- kreuzberg-3.9.1/tests/test_source_files/contract_test.txt +4 -0
- kreuzberg-3.9.1/tests/test_source_files/form_test.txt +5 -0
- kreuzberg-3.9.1/tests/test_source_files/invoice_image.png +0 -0
- kreuzberg-3.9.1/tests/test_source_files/invoice_test.txt +4 -0
- kreuzberg-3.9.1/tests/test_source_files/receipt_test.txt +5 -0
- kreuzberg-3.9.1/tests/test_source_files/report_test.txt +4 -0
- kreuzberg-3.9.1/tests/utils/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/uv.lock +910 -715
- kreuzberg-3.8.2/.github/workflows/ci.yaml +0 -124
- kreuzberg-3.8.2/docs/changelog.md +0 -32
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.commitlintrc +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.docker/README.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.dockerignore +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/LICENSE +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/cli.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/contributing.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/config_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/email_comprehensive_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- /kreuzberg-3.8.2/tests/utils/__init__.py → /kreuzberg-3.9.1/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/types_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.8.2 → kreuzberg-3.9.1}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
version = 1
|
2
|
+
|
3
|
+
test_patterns = ["tests/**"]
|
4
|
+
|
5
|
+
exclude_patterns = [
|
6
|
+
# Virtual environments
|
7
|
+
".venv/**",
|
8
|
+
"venv/**",
|
9
|
+
|
10
|
+
# Build and distribution artifacts
|
11
|
+
"dist/**",
|
12
|
+
"build/**",
|
13
|
+
"*.egg-info/**",
|
14
|
+
|
15
|
+
# Documentation
|
16
|
+
"docs/**",
|
17
|
+
"site/**",
|
18
|
+
|
19
|
+
# Cache directories
|
20
|
+
"**/__pycache__/**",
|
21
|
+
".pytest_cache/**",
|
22
|
+
".mypy_cache/**",
|
23
|
+
".ruff_cache/**",
|
24
|
+
".coverage",
|
25
|
+
"htmlcov/**",
|
26
|
+
|
27
|
+
# Benchmarks and performance tests
|
28
|
+
"benchmarks/**",
|
29
|
+
|
30
|
+
# IDE and editor files
|
31
|
+
".idea/**",
|
32
|
+
".vscode/**",
|
33
|
+
|
34
|
+
# Version control
|
35
|
+
".git/**",
|
36
|
+
|
37
|
+
# Temporary and generated files
|
38
|
+
"*.pyc",
|
39
|
+
".DS_Store",
|
40
|
+
"*.swp",
|
41
|
+
"*.swo",
|
42
|
+
]
|
43
|
+
|
44
|
+
[[analyzers]]
|
45
|
+
name = "test-coverage"
|
46
|
+
|
47
|
+
[[analyzers]]
|
48
|
+
name = "python"
|
49
|
+
|
50
|
+
[analyzers.meta]
|
51
|
+
runtime_version = "3.x.x"
|
52
|
+
|
53
|
+
[[transformers]]
|
54
|
+
name = "ruff"
|
@@ -0,0 +1,197 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
pull_request:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
push:
|
8
|
+
branches:
|
9
|
+
- main
|
10
|
+
- feat/smart-multiprocessing
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
validate:
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
timeout-minutes: 10
|
16
|
+
steps:
|
17
|
+
- name: Checkout
|
18
|
+
uses: actions/checkout@v4
|
19
|
+
|
20
|
+
- name: Install uv
|
21
|
+
uses: astral-sh/setup-uv@v6
|
22
|
+
with:
|
23
|
+
enable-cache: true
|
24
|
+
|
25
|
+
- name: Set up Python
|
26
|
+
uses: actions/setup-python@v5
|
27
|
+
with:
|
28
|
+
python-version-file: "pyproject.toml"
|
29
|
+
|
30
|
+
- name: Install Dependencies
|
31
|
+
uses: nick-fields/retry@v3
|
32
|
+
with:
|
33
|
+
timeout_minutes: 5
|
34
|
+
max_attempts: 3
|
35
|
+
retry_wait_seconds: 30
|
36
|
+
command: |
|
37
|
+
if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
|
38
|
+
echo "Removing existing .venv directory on Windows"
|
39
|
+
rm -rf .venv
|
40
|
+
fi
|
41
|
+
uv sync --all-packages --all-extras --dev
|
42
|
+
shell: bash
|
43
|
+
|
44
|
+
- name: Load Cached Pre-Commit Dependencies
|
45
|
+
id: cached-pre-commit-dependencies
|
46
|
+
uses: actions/cache@v4
|
47
|
+
with:
|
48
|
+
path: ~/.cache/pre-commit/
|
49
|
+
key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
50
|
+
|
51
|
+
- name: Execute Pre-Commit
|
52
|
+
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
53
|
+
|
54
|
+
test:
|
55
|
+
strategy:
|
56
|
+
matrix:
|
57
|
+
os: [ ubuntu-latest, macOS-latest, windows-latest ]
|
58
|
+
python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
|
59
|
+
runs-on: ${{ matrix.os }}
|
60
|
+
timeout-minutes: 30
|
61
|
+
steps:
|
62
|
+
- name: Checkout
|
63
|
+
uses: actions/checkout@v4
|
64
|
+
|
65
|
+
- name: Install uv
|
66
|
+
uses: astral-sh/setup-uv@v6
|
67
|
+
with:
|
68
|
+
enable-cache: true
|
69
|
+
|
70
|
+
- name: Install Python
|
71
|
+
uses: actions/setup-python@v5
|
72
|
+
id: setup-python
|
73
|
+
with:
|
74
|
+
python-version: ${{ matrix.python }}
|
75
|
+
|
76
|
+
- name: Cache Python Dependencies
|
77
|
+
id: python-cache
|
78
|
+
uses: actions/cache@v4
|
79
|
+
with:
|
80
|
+
path: |
|
81
|
+
~/.cache/uv
|
82
|
+
.venv
|
83
|
+
key: python-dependencies-${{ matrix.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('uv.lock') }}
|
84
|
+
restore-keys: |
|
85
|
+
python-dependencies-${{ matrix.os }}-${{ matrix.python }}-
|
86
|
+
|
87
|
+
- name: Install Dependencies
|
88
|
+
uses: nick-fields/retry@v3
|
89
|
+
with:
|
90
|
+
timeout_minutes: 5
|
91
|
+
max_attempts: 3
|
92
|
+
retry_wait_seconds: 30
|
93
|
+
command: |
|
94
|
+
if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
|
95
|
+
echo "Removing existing .venv directory on Windows"
|
96
|
+
rm -rf .venv
|
97
|
+
fi
|
98
|
+
uv sync --all-packages --all-extras --dev
|
99
|
+
shell: bash
|
100
|
+
|
101
|
+
- name: Cache Test Artifacts
|
102
|
+
uses: actions/cache@v4
|
103
|
+
with:
|
104
|
+
path: .pytest_cache/
|
105
|
+
key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
|
106
|
+
|
107
|
+
- name: Cache and Install Homebrew (macOS)
|
108
|
+
if: runner.os == 'macOS'
|
109
|
+
uses: nick-fields/retry@v3
|
110
|
+
with:
|
111
|
+
timeout_minutes: 10
|
112
|
+
max_attempts: 3
|
113
|
+
retry_wait_seconds: 30
|
114
|
+
command: |
|
115
|
+
# Using the underlying homebrew commands instead of the action
|
116
|
+
brew update || true
|
117
|
+
brew install tesseract tesseract-lang pandoc || brew upgrade tesseract tesseract-lang pandoc || true
|
118
|
+
brew list tesseract tesseract-lang pandoc
|
119
|
+
shell: bash
|
120
|
+
|
121
|
+
- name: Cache and Install APT Packages (Linux)
|
122
|
+
if: runner.os == 'Linux'
|
123
|
+
uses: nick-fields/retry@v3
|
124
|
+
with:
|
125
|
+
timeout_minutes: 5
|
126
|
+
max_attempts: 3
|
127
|
+
retry_wait_seconds: 30
|
128
|
+
command: |
|
129
|
+
sudo apt-get update
|
130
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
131
|
+
shell: bash
|
132
|
+
|
133
|
+
- name: Install System Dependencies (Windows)
|
134
|
+
if: runner.os == 'Windows'
|
135
|
+
uses: nick-fields/retry@v3
|
136
|
+
with:
|
137
|
+
timeout_minutes: 10
|
138
|
+
max_attempts: 3
|
139
|
+
retry_wait_seconds: 30
|
140
|
+
command: |
|
141
|
+
choco install -y tesseract pandoc --no-progress
|
142
|
+
Write-Output "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
143
|
+
Write-Output "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
144
|
+
$env:PATH = "C:\Program Files\Tesseract-OCR;C:\Program Files\Pandoc;" + $env:PATH
|
145
|
+
tesseract --version
|
146
|
+
pandoc --version
|
147
|
+
shell: pwsh
|
148
|
+
|
149
|
+
- name: Clean Coverage Data
|
150
|
+
run: |
|
151
|
+
rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
|
152
|
+
shell: bash
|
153
|
+
|
154
|
+
- name: Run Tests with Coverage
|
155
|
+
run: |
|
156
|
+
uv run coverage erase
|
157
|
+
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
|
158
|
+
|
159
|
+
- name: Upload Coverage Artifacts
|
160
|
+
if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
|
161
|
+
uses: actions/upload-artifact@v4
|
162
|
+
with:
|
163
|
+
name: coverage-report
|
164
|
+
path: coverage.lcov
|
165
|
+
retention-days: 1
|
166
|
+
|
167
|
+
upload-coverage:
|
168
|
+
needs: test
|
169
|
+
runs-on: ubuntu-latest
|
170
|
+
if: github.event_name == 'push' || github.event_name == 'pull_request'
|
171
|
+
steps:
|
172
|
+
- name: Checkout
|
173
|
+
uses: actions/checkout@v4
|
174
|
+
with:
|
175
|
+
ref: ${{ github.event.pull_request.head.sha || github.sha }}
|
176
|
+
|
177
|
+
- name: Download Coverage Artifacts
|
178
|
+
uses: actions/download-artifact@v4
|
179
|
+
with:
|
180
|
+
name: coverage-report
|
181
|
+
path: .
|
182
|
+
|
183
|
+
- name: Install DeepSource CLI
|
184
|
+
uses: nick-fields/retry@v3
|
185
|
+
with:
|
186
|
+
timeout_minutes: 3
|
187
|
+
max_attempts: 3
|
188
|
+
retry_wait_seconds: 10
|
189
|
+
command: |
|
190
|
+
curl -fsSL https://deepsource.io/cli | sh
|
191
|
+
shell: bash
|
192
|
+
|
193
|
+
- name: Upload Coverage to DeepSource
|
194
|
+
env:
|
195
|
+
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
196
|
+
run: |
|
197
|
+
./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
|
@@ -1,13 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.8.2
|
3
|
+
Version: 3.9.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
7
7
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
8
|
License: MIT
|
9
9
|
License-File: LICENSE
|
10
|
-
Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
10
|
+
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
12
12
|
Classifier: Intended Audience :: Developers
|
13
13
|
Classifier: Intended Audience :: Information Technology
|
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
|
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
31
|
Requires-Dist: anyio>=4.9.0
|
32
|
-
Requires-Dist: chardetng-py>=0.3.
|
32
|
+
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
35
|
-
Requires-Dist: mcp>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
+
Requires-Dist: mcp>=1.12.2
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: playa-pdf>=0.6.
|
37
|
+
Requires-Dist: playa-pdf>=0.6.4
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
39
39
|
Requires-Dist: pypdfium2==4.30.0
|
40
40
|
Requires-Dist: python-calamine>=0.3.2
|
@@ -53,18 +53,21 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
53
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
54
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
55
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
-
Requires-Dist: rich>=14.
|
56
|
+
Requires-Dist: rich>=14.1.0; extra == 'all'
|
57
57
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
58
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
59
59
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
60
60
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
61
61
|
Provides-Extra: api
|
62
62
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
63
|
+
Provides-Extra: auto-classify-document-type
|
64
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
65
|
+
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
63
66
|
Provides-Extra: chunking
|
64
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
65
68
|
Provides-Extra: cli
|
66
69
|
Requires-Dist: click>=8.2.1; extra == 'cli'
|
67
|
-
Requires-Dist: rich>=14.
|
70
|
+
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
68
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
69
72
|
Provides-Extra: easyocr
|
70
73
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
@@ -88,7 +91,7 @@ Description-Content-Type: text/markdown
|
|
88
91
|
[](https://kreuzberg.dev/)
|
89
92
|
[](https://benchmarks.kreuzberg.dev/)
|
90
93
|
[](https://opensource.org/licenses/MIT)
|
91
|
-
[](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
|
92
95
|
|
93
96
|
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
94
97
|
|
@@ -103,6 +106,7 @@ Description-Content-Type: text/markdown
|
|
103
106
|
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
104
107
|
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
105
108
|
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
109
|
+
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
106
110
|
|
107
111
|
### Technical Architecture
|
108
112
|
|
@@ -126,14 +130,14 @@ Kreuzberg leverages established open source technologies:
|
|
126
130
|
### Extract Text with CLI
|
127
131
|
|
128
132
|
```bash
|
129
|
-
# Extract text from any file to
|
130
|
-
uvx kreuzberg extract document.pdf > output.
|
133
|
+
# Extract text from any file to text format
|
134
|
+
uvx kreuzberg extract document.pdf > output.txt
|
131
135
|
|
132
136
|
# With all features (OCR, table extraction, etc.)
|
133
|
-
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format
|
137
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
|
134
138
|
|
135
139
|
# Extract with rich metadata
|
136
|
-
uvx kreuzberg extract report.pdf --show-metadata --format json
|
140
|
+
uvx kreuzberg extract report.pdf --show-metadata --output-format json
|
137
141
|
```
|
138
142
|
|
139
143
|
### Python Usage
|
@@ -5,7 +5,7 @@
|
|
5
5
|
[](https://kreuzberg.dev/)
|
6
6
|
[](https://benchmarks.kreuzberg.dev/)
|
7
7
|
[](https://opensource.org/licenses/MIT)
|
8
|
-
[](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
|
9
9
|
|
10
10
|
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
11
11
|
|
@@ -20,6 +20,7 @@
|
|
20
20
|
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
21
21
|
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
22
22
|
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
23
|
+
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
23
24
|
|
24
25
|
### Technical Architecture
|
25
26
|
|
@@ -43,14 +44,14 @@ Kreuzberg leverages established open source technologies:
|
|
43
44
|
### Extract Text with CLI
|
44
45
|
|
45
46
|
```bash
|
46
|
-
# Extract text from any file to
|
47
|
-
uvx kreuzberg extract document.pdf > output.
|
47
|
+
# Extract text from any file to text format
|
48
|
+
uvx kreuzberg extract document.pdf > output.txt
|
48
49
|
|
49
50
|
# With all features (OCR, table extraction, etc.)
|
50
|
-
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format
|
51
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
|
51
52
|
|
52
53
|
# Extract with rich metadata
|
53
|
-
uvx kreuzberg extract report.pdf --show-metadata --format json
|
54
|
+
uvx kreuzberg extract report.pdf --show-metadata --output-format json
|
54
55
|
```
|
55
56
|
|
56
57
|
### Python Usage
|
@@ -43,7 +43,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
|
|
43
43
|
print(f"Tables: {len(cold_result.tables)}")
|
44
44
|
print(f"Chunks: {len(cold_result.chunks)}")
|
45
45
|
|
46
|
-
from kreuzberg._utils._cache import (
|
46
|
+
from kreuzberg._utils._cache import ( # noqa: PLC0415
|
47
47
|
get_ocr_cache,
|
48
48
|
get_table_cache,
|
49
49
|
get_mime_cache,
|
@@ -130,7 +130,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
|
|
130
130
|
f" Cache consistency: {'✅ STABLE' if warm_clean_stdev / warm_clean_mean < 0.1 else '⚠️ VARIABLE'}"
|
131
131
|
)
|
132
132
|
|
133
|
-
from kreuzberg._utils._cache import (
|
133
|
+
from kreuzberg._utils._cache import ( # noqa: PLC0415
|
134
134
|
get_ocr_cache,
|
135
135
|
get_table_cache,
|
136
136
|
get_mime_cache,
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to Kreuzberg will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [3.9.0] - 2025-01-17
|
9
|
+
|
10
|
+
### Added
|
11
|
+
|
12
|
+
- Automatic Document Type Detection (#88) - A new feature for classifying documents into categories (contract, form, invoice, receipt, report)
|
13
|
+
- Integration with Google Translate for multi-language support
|
14
|
+
- New optional dependency group `auto-classify-document-type` with `deep-translator` and `pandas`
|
15
|
+
- Comprehensive tests and documentation
|
16
|
+
- DeepSource integration for code quality analysis
|
17
|
+
|
18
|
+
### Fixed
|
19
|
+
|
20
|
+
- PDF extraction handling when no OCR backend is available
|
21
|
+
- Entity extraction test updated to use frozenset of tuples
|
22
|
+
- Config handling for dataclasses with `slots=True` - replaced `config.__dict__` with `asdict(config)`
|
23
|
+
- Coverage configuration and cleanup issues
|
24
|
+
|
25
|
+
### Changed
|
26
|
+
|
27
|
+
- CI/CD: Added retry logic for flaky steps across all platforms
|
28
|
+
- Improved coverage gathering and cleanup in test runs
|
29
|
+
- Updated dependencies in `uv.lock`
|
30
|
+
|
31
|
+
## [3.8.2] - Previous Release
|
32
|
+
|
33
|
+
### Added
|
34
|
+
|
35
|
+
- Documentation site with comprehensive examples and API reference
|
36
|
+
- Improved configuration for all OCR backends
|
37
|
+
- Added hooks system for validation and post-processing
|
38
|
+
- Language detection feature with `auto_detect_language` configuration option
|
39
|
+
- New optional dependency group `langdetect` for automatic language detection
|
40
|
+
|
41
|
+
### Changed
|
42
|
+
|
43
|
+
- Refactored internal structure for better maintainability
|
44
|
+
- Updated extraction functions to use config object instead of kwargs
|
45
|
+
- Improved error messages and reporting
|
46
|
+
|
47
|
+
## Previous Versions
|
48
|
+
|
49
|
+
For a complete history of changes, please refer to the [GitHub releases page](https://github.com/Goldziher/kreuzberg/releases).
|
@@ -49,6 +49,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
|
|
49
49
|
- **OCR Engines**: Tesseract (default), EasyOCR, PaddleOCR with automatic fallback strategies
|
50
50
|
- **Data Extraction**: Text content, document metadata, table structures, and embedded resources
|
51
51
|
- **Processing Capabilities**: Content chunking for RAG pipelines, language detection, format preservation
|
52
|
+
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
52
53
|
- **Extensibility**: Plugin architecture for custom extractors and hooks
|
53
54
|
|
54
55
|
## Architecture Philosophy
|
@@ -131,3 +131,31 @@ async def show_metadata():
|
|
131
131
|
|
132
132
|
asyncio.run(show_metadata())
|
133
133
|
```
|
134
|
+
|
135
|
+
## Document Classification
|
136
|
+
|
137
|
+
Kreuzberg can automatically classify documents into categories (contracts, forms, invoices, receipts, reports):
|
138
|
+
|
139
|
+
```python
|
140
|
+
import asyncio
|
141
|
+
from kreuzberg import extract_file, ExtractionConfig
|
142
|
+
|
143
|
+
async def classify_document():
|
144
|
+
config = ExtractionConfig(
|
145
|
+
auto_detect_document_type=True,
|
146
|
+
document_classification_mode="text", # or "vision" for better accuracy
|
147
|
+
type_confidence_threshold=0.5,
|
148
|
+
)
|
149
|
+
|
150
|
+
result = await extract_file("invoice.pdf", config=config)
|
151
|
+
|
152
|
+
# Access classification results
|
153
|
+
if result.document_type:
|
154
|
+
print(f"Document type: {result.document_type}")
|
155
|
+
print(f"Confidence: {result.type_confidence:.2%}")
|
156
|
+
|
157
|
+
# The extracted content is still available
|
158
|
+
print(f"Content: {result.content[:200]}...")
|
159
|
+
|
160
|
+
asyncio.run(classify_document())
|
161
|
+
```
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Automatic Document Classification
|
2
|
+
|
3
|
+
Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
|
4
|
+
|
5
|
+
## Enabling Document Classification
|
6
|
+
|
7
|
+
To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
|
8
|
+
|
9
|
+
```python
|
10
|
+
from kreuzberg import ExtractionConfig, extract_file
|
11
|
+
|
12
|
+
config = ExtractionConfig(auto_detect_document_type=True)
|
13
|
+
result = await extract_file("path/to/your/document.pdf", config=config)
|
14
|
+
|
15
|
+
if result.document_type:
|
16
|
+
print(f"Detected document type: {result.document_type}")
|
17
|
+
print(f"Confidence: {result.type_confidence:.2f}")
|
18
|
+
```
|
19
|
+
|
20
|
+
## Classification Modes
|
21
|
+
|
22
|
+
You can choose between two classification modes using the `document_classification_mode` parameter in `ExtractionConfig`:
|
23
|
+
|
24
|
+
- `"text"` (default): This mode uses a rule-based classifier that analyzes the extracted text for keywords and patterns. It's fast and works well for text-based documents.
|
25
|
+
- `"vision"`: This mode uses layout information from OCR to identify document types. It's more accurate for scanned documents and images, but it requires the Tesseract OCR backend.
|
26
|
+
|
27
|
+
Here's how to use the vision-based classifier:
|
28
|
+
|
29
|
+
```python
|
30
|
+
config = ExtractionConfig(
|
31
|
+
auto_detect_document_type=True,
|
32
|
+
document_classification_mode="vision",
|
33
|
+
force_ocr=True, # Recommended for vision-based classification
|
34
|
+
)
|
35
|
+
```
|
36
|
+
|
37
|
+
## Confidence Threshold
|
38
|
+
|
39
|
+
You can control the minimum confidence required for a classification to be considered valid by setting the `type_confidence_threshold` in `ExtractionConfig`. The default value is `0.7`.
|
40
|
+
|
41
|
+
```python
|
42
|
+
config = ExtractionConfig(
|
43
|
+
auto_detect_document_type=True,
|
44
|
+
type_confidence_threshold=0.85, # Require 85% confidence
|
45
|
+
)
|
46
|
+
```
|
47
|
+
|
48
|
+
## Output
|
49
|
+
|
50
|
+
The classification results are available in the `ExtractionResult` object:
|
51
|
+
|
52
|
+
- `document_type`: The detected document type (e.g., `"invoice"`, `"contract"`) or `None` if no type was detected with sufficient confidence.
|
53
|
+
- `type_confidence`: The confidence score of the detection (a float between 0.0 and 1.0) or `None`.
|
@@ -31,6 +31,9 @@ max_chars = 2000
|
|
31
31
|
max_overlap = 100
|
32
32
|
ocr_backend = "tesseract"
|
33
33
|
auto_detect_language = true
|
34
|
+
auto_detect_document_type = true
|
35
|
+
document_classification_mode = "text" # or "vision"
|
36
|
+
type_confidence_threshold = 0.5
|
34
37
|
|
35
38
|
# Tesseract OCR configuration
|
36
39
|
[tesseract]
|
@@ -76,6 +79,9 @@ force_ocr = false
|
|
76
79
|
chunk_content = true
|
77
80
|
extract_tables = true
|
78
81
|
auto_detect_language = true
|
82
|
+
auto_detect_document_type = true
|
83
|
+
document_classification_mode = "text"
|
84
|
+
type_confidence_threshold = 0.5
|
79
85
|
|
80
86
|
[tool.kreuzberg.tesseract]
|
81
87
|
language = "eng"
|
@@ -8,6 +8,7 @@ This guide provides comprehensive documentation for the Kreuzberg document intel
|
|
8
8
|
- [Extraction Configuration](extraction-configuration.md) - Configure the extraction process ([API](../api-reference/types.md#extractionconfig))
|
9
9
|
- [Metadata Extraction](metadata-extraction.md) - Document metadata extraction ([API](../api-reference/types.md#metadata))
|
10
10
|
- [Content Chunking](chunking.md) - Split documents into manageable chunks
|
11
|
+
- [Document Classification](document-classification.md) - Automatic document type detection
|
11
12
|
- [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
|
12
13
|
- [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines
|
13
14
|
- [Supported Formats](supported-formats.md) - All supported document formats
|
@@ -2,9 +2,9 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING
|
4
4
|
|
5
|
-
from kreuzberg import MissingDependencyError
|
6
5
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
6
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
7
|
+
from kreuzberg.exceptions import MissingDependencyError
|
8
8
|
|
9
9
|
if TYPE_CHECKING:
|
10
10
|
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
@@ -36,11 +36,11 @@ def get_chunker(
|
|
36
36
|
if key not in _chunkers:
|
37
37
|
try:
|
38
38
|
if mime_type == MARKDOWN_MIME_TYPE:
|
39
|
-
from semantic_text_splitter import MarkdownSplitter
|
39
|
+
from semantic_text_splitter import MarkdownSplitter # noqa: PLC0415
|
40
40
|
|
41
41
|
_chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
|
42
42
|
else:
|
43
|
-
from semantic_text_splitter import TextSplitter
|
43
|
+
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
44
44
|
|
45
45
|
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
46
|
except ImportError as e:
|
@@ -95,7 +95,7 @@ def parse_ocr_backend_config(
|
|
95
95
|
# Convert psm integer to PSMMode enum if needed
|
96
96
|
processed_config = backend_config.copy()
|
97
97
|
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
98
|
-
from kreuzberg._ocr._tesseract import PSMMode
|
98
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
99
99
|
|
100
100
|
processed_config["psm"] = PSMMode(processed_config["psm"])
|
101
101
|
return TesseractConfig(**processed_config)
|