kreuzberg 3.8.0__tar.gz → 3.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/PKG-INFO +31 -43
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/README.md +22 -39
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/index.md +11 -15
- kreuzberg-3.8.1/docs/performance-analysis.md +140 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_entity_extraction.py +1 -2
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_base.py +3 -5
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_image.py +18 -32
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pandoc.py +3 -14
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pdf.py +19 -40
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_gmft.py +314 -7
- kreuzberg-3.8.1/kreuzberg/_ocr/__init__.py +26 -0
- kreuzberg-3.8.1/kreuzberg/_ocr/_base.py +113 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_easyocr.py +91 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg-3.8.1/kreuzberg/_ocr/_tesseract.py +996 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_cache.py +35 -2
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_errors.py +3 -7
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_process_pool.py +2 -6
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/cli.py +1 -2
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/extraction.py +4 -22
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/mkdocs.yaml +1 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/pyproject.toml +17 -12
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extraction_batch_test.py +4 -4
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/image_test.py +52 -69
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/gmft_test.py +15 -2
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_isolated_test.py +11 -10
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/tesseract_pool_test.py +4 -4
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/base_test.py +14 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/easyocr_test.py +36 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/paddleocr_test.py +50 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/tesseract_test.py +44 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/process_pool_test.py +1 -1
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/uv.lock +1 -1
- kreuzberg-3.8.0/kreuzberg/_multiprocessing/__init__.py +0 -5
- kreuzberg-3.8.0/kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg-3.8.0/kreuzberg/_ocr/__init__.py +0 -47
- kreuzberg-3.8.0/kreuzberg/_ocr/_base.py +0 -54
- kreuzberg-3.8.0/kreuzberg/_ocr/_pool.py +0 -357
- kreuzberg-3.8.0/kreuzberg/_ocr/_sync.py +0 -566
- kreuzberg-3.8.0/kreuzberg/_ocr/_tesseract.py +0 -440
- kreuzberg-3.8.0/tests/multiprocessing/sync_easyocr_test.py +0 -640
- kreuzberg-3.8.0/tests/multiprocessing/sync_paddleocr_test.py +0 -529
- kreuzberg-3.8.0/tests/multiprocessing/sync_tesseract_test.py +0 -362
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.commitlintrc +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.docker/README.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.dockerignore +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.gitignore +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/LICENSE +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/changelog.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/cli.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/contributing.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/conftest.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/email_comprehensive_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/toml/sample-config.toml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/types_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.8.0 → kreuzberg-3.8.1}/tests/utils/tmp_test.py +0 -0
@@ -1,14 +1,16 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.8.0
|
4
|
-
Summary:
|
3
|
+
Version: 3.8.1
|
4
|
+
Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,entity-extraction,image-to-text,
|
9
|
+
Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: Intended Audience :: Information Technology
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
12
14
|
Classifier: License :: OSI Approved :: MIT License
|
13
15
|
Classifier: Operating System :: OS Independent
|
14
16
|
Classifier: Programming Language :: Python :: 3 :: Only
|
@@ -16,10 +18,13 @@ Classifier: Programming Language :: Python :: 3.10
|
|
16
18
|
Classifier: Programming Language :: Python :: 3.11
|
17
19
|
Classifier: Programming Language :: Python :: 3.12
|
18
20
|
Classifier: Programming Language :: Python :: 3.13
|
21
|
+
Classifier: Topic :: Database
|
22
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
23
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
19
24
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
20
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
27
|
Classifier: Topic :: Text Processing :: General
|
22
|
-
Classifier: Topic :: Utilities
|
23
28
|
Classifier: Typing :: Typed
|
24
29
|
Requires-Python: >=3.10
|
25
30
|
Requires-Dist: anyio>=4.9.0
|
@@ -83,49 +88,31 @@ Description-Content-Type: text/markdown
|
|
83
88
|
[License: MIT](https://opensource.org/licenses/MIT)
|
84
89
|
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
85
90
|
|
86
|
-
**
|
91
|
+
**Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
|
87
92
|
|
88
93
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
89
94
|
|
90
95
|
## Why Choose Kreuzberg?
|
91
96
|
|
92
|
-
###
|
97
|
+
### ⚡ Proven Performance
|
93
98
|
|
94
|
-
|
95
|
-
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
96
|
-
- Lowest memory usage (~530MB average) optimized for production workloads
|
97
|
-
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
99
|
+
[Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
|
98
100
|
|
99
|
-
###
|
101
|
+
### 🏗️ Production Engineering
|
100
102
|
|
101
|
-
|
102
|
-
- 95%+ test coverage with comprehensive test suite
|
103
|
-
- Thoroughly benchmarked and profiled for real-world performance
|
104
|
-
- Only framework offering true async/await support alongside sync APIs
|
105
|
-
- Robust error handling and detailed logging
|
103
|
+
Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
|
106
104
|
|
107
|
-
###
|
105
|
+
### 🔧 Developer Experience
|
108
106
|
|
109
|
-
|
110
|
-
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
111
|
-
- Full type safety with excellent IDE support (completions)
|
112
|
-
- Comprehensive documentation including full API reference
|
107
|
+
Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
|
113
108
|
|
114
|
-
###
|
109
|
+
### 🚀 Flexible Deployment
|
115
110
|
|
116
|
-
|
117
|
-
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
118
|
-
- CPU-only processing - no GPU requirements, lower energy consumption
|
119
|
-
- 100% local processing - no external API dependencies
|
120
|
-
- Multiple deployment modes: CLI, REST API, MCP server
|
111
|
+
Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
|
121
112
|
|
122
|
-
###
|
113
|
+
### 📄 Comprehensive Format Support
|
123
114
|
|
124
|
-
|
125
|
-
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
126
|
-
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
127
|
-
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
128
|
-
- Fully extensible: Add your own extractors
|
115
|
+
Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
|
129
116
|
|
130
117
|
## Quick Start
|
131
118
|
|
@@ -161,7 +148,7 @@ import asyncio
|
|
161
148
|
from kreuzberg import extract_file
|
162
149
|
|
163
150
|
async def main():
|
164
|
-
# Extract from
|
151
|
+
# Extract content from files
|
165
152
|
result = await extract_file("document.pdf")
|
166
153
|
print(result.content)
|
167
154
|
print(result.metadata)
|
@@ -275,23 +262,23 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
275
262
|
|
276
263
|
## 📊 Performance Comparison
|
277
264
|
|
278
|
-
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across
|
265
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
|
279
266
|
|
280
|
-
| Framework | Speed
|
281
|
-
| ------------- |
|
282
|
-
| **Kreuzberg** |
|
283
|
-
| Unstructured | ~12 files/s
|
284
|
-
| MarkItDown | ~15 files/s
|
285
|
-
| Docling | ~1 file/min
|
267
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
268
|
+
| ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
|
269
|
+
| **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
|
270
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
271
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
272
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
286
273
|
|
287
274
|
\*_Performance varies significantly with document complexity and size_
|
288
275
|
|
289
276
|
**Key strengths:**
|
290
277
|
|
291
|
-
-
|
278
|
+
- 6-126x faster processing than comparable frameworks
|
292
279
|
- Smallest installation footprint and memory usage
|
293
280
|
- Only framework with built-in async/await support
|
294
|
-
-
|
281
|
+
- Supports both CPU and GPU processing
|
295
282
|
- Built by software engineers for production reliability
|
296
283
|
|
297
284
|
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
@@ -302,6 +289,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
302
289
|
|
303
290
|
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
304
291
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
292
|
+
- [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
|
305
293
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
306
294
|
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
307
295
|
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
@@ -6,49 +6,31 @@
|
|
6
6
|
[License: MIT](https://opensource.org/licenses/MIT)
|
7
7
|
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
8
8
|
|
9
|
-
**
|
9
|
+
**Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
|
10
10
|
|
11
11
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
12
12
|
|
13
13
|
## Why Choose Kreuzberg?
|
14
14
|
|
15
|
-
###
|
15
|
+
### ⚡ Proven Performance
|
16
16
|
|
17
|
-
|
18
|
-
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
19
|
-
- Lowest memory usage (~530MB average) optimized for production workloads
|
20
|
-
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
17
|
+
[Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
|
21
18
|
|
22
|
-
###
|
19
|
+
### 🏗️ Production Engineering
|
23
20
|
|
24
|
-
|
25
|
-
- 95%+ test coverage with comprehensive test suite
|
26
|
-
- Thoroughly benchmarked and profiled for real-world performance
|
27
|
-
- Only framework offering true async/await support alongside sync APIs
|
28
|
-
- Robust error handling and detailed logging
|
21
|
+
Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
|
29
22
|
|
30
|
-
###
|
23
|
+
### 🔧 Developer Experience
|
31
24
|
|
32
|
-
|
33
|
-
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
34
|
-
- Full type safety with excellent IDE support (completions)
|
35
|
-
- Comprehensive documentation including full API reference
|
25
|
+
Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
|
36
26
|
|
37
|
-
###
|
27
|
+
### 🚀 Flexible Deployment
|
38
28
|
|
39
|
-
|
40
|
-
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
41
|
-
- CPU-only processing - no GPU requirements, lower energy consumption
|
42
|
-
- 100% local processing - no external API dependencies
|
43
|
-
- Multiple deployment modes: CLI, REST API, MCP server
|
29
|
+
Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
|
44
30
|
|
45
|
-
###
|
31
|
+
### 📄 Comprehensive Format Support
|
46
32
|
|
47
|
-
|
48
|
-
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
49
|
-
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
50
|
-
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
51
|
-
- Fully extensible: Add your own extractors
|
33
|
+
Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
|
52
34
|
|
53
35
|
## Quick Start
|
54
36
|
|
@@ -84,7 +66,7 @@ import asyncio
|
|
84
66
|
from kreuzberg import extract_file
|
85
67
|
|
86
68
|
async def main():
|
87
|
-
# Extract from
|
69
|
+
# Extract content from files
|
88
70
|
result = await extract_file("document.pdf")
|
89
71
|
print(result.content)
|
90
72
|
print(result.metadata)
|
@@ -198,23 +180,23 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
198
180
|
|
199
181
|
## 📊 Performance Comparison
|
200
182
|
|
201
|
-
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across
|
183
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
|
202
184
|
|
203
|
-
| Framework | Speed
|
204
|
-
| ------------- |
|
205
|
-
| **Kreuzberg** |
|
206
|
-
| Unstructured | ~12 files/s
|
207
|
-
| MarkItDown | ~15 files/s
|
208
|
-
| Docling | ~1 file/min
|
185
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
186
|
+
| ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
|
187
|
+
| **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
|
188
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
189
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
190
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
209
191
|
|
210
192
|
\*_Performance varies significantly with document complexity and size_
|
211
193
|
|
212
194
|
**Key strengths:**
|
213
195
|
|
214
|
-
-
|
196
|
+
- 6-126x faster processing than comparable frameworks
|
215
197
|
- Smallest installation footprint and memory usage
|
216
198
|
- Only framework with built-in async/await support
|
217
|
-
-
|
199
|
+
- Supports both CPU and GPU processing
|
218
200
|
- Built by software engineers for production reliability
|
219
201
|
|
220
202
|
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
@@ -225,6 +207,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
225
207
|
|
226
208
|
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
227
209
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
210
|
+
- [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
|
228
211
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
229
212
|
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
230
213
|
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
@@ -1,23 +1,19 @@
|
|
1
1
|
# Kreuzberg
|
2
2
|
|
3
|
-
Kreuzberg is
|
4
|
-
|
5
|
-
|
6
|
-
Kreuzberg was also created (primarily) in Kreuzberg - the famous and beautiful neighborhood of Berlin.
|
3
|
+
Kreuzberg is an advanced open source document intelligence framework built for production workloads. Designed by engineers for reliability and performance, it transforms PDFs, images, and office documents into structured data with minimal complexity.
|
4
|
+
|
5
|
+
Built on proven technologies including PDFium, Tesseract, and Pandoc, Kreuzberg delivers enterprise-grade document processing capabilities while maintaining simplicity and speed.
|
7
6
|
|
8
7
|
## Why Kreuzberg?
|
9
8
|
|
10
|
-
|
11
|
-
BUT - this is not necessarily a mutually exclusive solution. For example.
|
12
|
-
many text extraction pipelines can integrate a library such as Kreuzberg with some kind of heuristics on when to use it
|
13
|
-
and when use something else.
|
9
|
+
Kreuzberg addresses real production needs with measurable benefits. It does not need to be an exclusive solution: it integrates well with existing pipelines and can be deployed alongside other tools based on specific requirements.
|
14
10
|
|
15
11
|
### 🚀 Performance
|
16
12
|
|
17
|
-
- [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) -
|
18
|
-
faster than
|
19
|
-
- Minimal footprint:
|
20
|
-
- Lowest memory usage (~
|
13
|
+
- [Benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 6-126x
|
14
|
+
faster than competitors
|
15
|
+
- Minimal footprint: 87MB install vs 1GB+ for competitors
|
16
|
+
- Lowest memory usage (~360MB average) optimized for production workloads
|
21
17
|
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
22
18
|
|
23
19
|
### 🛠️ Engineering Quality
|
@@ -39,13 +35,13 @@ and when use something else.
|
|
39
35
|
|
40
36
|
- Docker images for all architectures (AMD64, ARM64)
|
41
37
|
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
42
|
-
-
|
43
|
-
-
|
38
|
+
- Supports both CPU and GPU processing (PaddleOCR, EasyOCR)
|
39
|
+
- Local processing - no external API dependencies
|
44
40
|
- Multiple deployment modes: CLI, REST API, MCP server
|
45
41
|
|
46
42
|
### 🎯 Complete Solution
|
47
43
|
|
48
|
-
-
|
44
|
+
- Comprehensive format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
|
49
45
|
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
50
46
|
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
51
47
|
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
@@ -0,0 +1,140 @@
|
|
1
|
+
# Performance Analysis
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
|
5
|
+
This page presents comprehensive benchmark results comparing Kreuzberg against other text extraction frameworks. All data is derived from rigorous testing across ~100 real-world documents using a standardized methodology.
|
6
|
+
|
7
|
+
> **Benchmark Methodology**: Results based on the [python-text-extraction-libraries-benchmarks-2025](https://github.com/Goldziher/python-text-extraction-libraries-benchmarks-2025) project with comprehensive testing across multiple document types and sizes.
|
8
|
+
|
9
|
+
## Executive Summary
|
10
|
+
|
11
|
+
Kreuzberg demonstrates exceptional performance across all key metrics:
|
12
|
+
|
13
|
+
- **Speed**: 6-126x faster than competitors
|
14
|
+
- **Memory**: 2-4x lower usage
|
15
|
+
- **Installation**: 2-68x smaller footprint
|
16
|
+
- **Reliability**: Perfect 100% success rate
|
17
|
+
|
18
|
+
## Detailed Performance Metrics
|
19
|
+
|
20
|
+
### Processing Speed
|
21
|
+
|
22
|
+
#### By File Size Category
|
23
|
+
|
24
|
+
| Category | Kreuzberg Sync | Kreuzberg Async | Best Competitor | Advantage |
|
25
|
+
| --------------------- | -------------- | --------------- | --------------- | ----------- |
|
26
|
+
| **Tiny (\<100KB)** | 31.6 files/sec | 23.6 files/sec | 4.8 files/sec | 6.6x faster |
|
27
|
+
| **Small (100KB-1MB)** | 9.0 files/sec | 10.1 files/sec | 3.6 files/sec | 2.8x faster |
|
28
|
+
| **Medium (1-10MB)** | 2.6 files/sec | 3.2 files/sec | 0.065 files/sec | 49x faster |
|
29
|
+
|
30
|
+
#### Processing Time Comparison
|
31
|
+
|
32
|
+
| Framework | Tiny Files (s) | Small Files (s) | Medium Files (s) |
|
33
|
+
| ------------------- | -------------- | --------------- | ---------------- |
|
34
|
+
| **Kreuzberg Sync** | 0.032 | 0.111 | 0.388 |
|
35
|
+
| **Kreuzberg Async** | 0.042 | 0.099 | 0.315 |
|
36
|
+
| Extractous | 0.316 | 0.281 | 15.38 |
|
37
|
+
| Unstructured | 0.210 | 1.123 | - |
|
38
|
+
| Docling | 3.956 | 14.47 | - |
|
39
|
+
|
40
|
+
### Memory Usage
|
41
|
+
|
42
|
+
| Framework | Average Memory (MB) | vs Kreuzberg |
|
43
|
+
| ------------------- | ------------------- | ------------ |
|
44
|
+
| **Kreuzberg Sync** | 360 | Baseline |
|
45
|
+
| **Kreuzberg Async** | 396 | +10% |
|
46
|
+
| Extractous | 513 | +43% |
|
47
|
+
| Unstructured | 1,389 | +286% |
|
48
|
+
| Docling | 1,838 | +411% |
|
49
|
+
|
50
|
+
### Installation Size
|
51
|
+
|
52
|
+
| Framework | Size (MB) | Packages | vs Kreuzberg |
|
53
|
+
| ------------- | --------- | -------- | ------------ |
|
54
|
+
| **Kreuzberg** | 87 | 43 | Baseline |
|
55
|
+
| Unstructured | 176 | 54 | 2.0x larger |
|
56
|
+
| MarkItDown | 208 | 25 | 2.4x larger |
|
57
|
+
| Docling | 5,900 | 103 | 67.8x larger |
|
58
|
+
|
59
|
+
### Success Rate & Reliability
|
60
|
+
|
61
|
+
| Framework | Tiny Files | Small Files | Medium Files | Overall |
|
62
|
+
| ------------- | ---------- | ----------- | ------------ | -------- |
|
63
|
+
| **Kreuzberg** | 100% | 100% | 100% | **100%** |
|
64
|
+
| Extractous | 100% | 95.8% | 100% | 98.6% |
|
65
|
+
| Unstructured | 100% | 100% | - | 100% |
|
66
|
+
| Docling | 100% | 96.3% | - | 98.2% |
|
67
|
+
|
68
|
+
### Content Extraction Quality
|
69
|
+
|
70
|
+
#### Characters Extracted (Average)
|
71
|
+
|
72
|
+
| Framework | Tiny Files | Small Files | Medium Files |
|
73
|
+
| ------------- | ---------- | ----------- | ------------ |
|
74
|
+
| **Kreuzberg** | 6,950 | 173,505 | 500,643 |
|
75
|
+
| Extractous | 6,894 | 106,641 | 251,612 |
|
76
|
+
| Unstructured | 3,842 | 70,396 | - |
|
77
|
+
| Docling | 3,316 | 59,129 | - |
|
78
|
+
|
79
|
+
## Performance Insights
|
80
|
+
|
81
|
+
### Speed Advantages
|
82
|
+
|
83
|
+
1. **Optimized Processing Pipeline**: Efficient async/await implementation
|
84
|
+
1. **Smart Resource Management**: Minimal overhead operations
|
85
|
+
1. **Native Libraries**: Built on high-performance C libraries (PDFium, Tesseract)
|
86
|
+
|
87
|
+
### Memory Efficiency
|
88
|
+
|
89
|
+
1. **Lean Architecture**: Minimal memory footprint during processing
|
90
|
+
1. **Resource Cleanup**: Proper resource disposal and garbage collection
|
91
|
+
1. **Streaming Processing**: Process large files without loading entirely into memory
|
92
|
+
|
93
|
+
### Installation Benefits
|
94
|
+
|
95
|
+
1. **Minimal Dependencies**: Only essential packages included
|
96
|
+
1. **No Heavy ML Models**: CPU-focused processing without large model files
|
97
|
+
1. **Efficient Packaging**: Optimized distribution with selective dependencies
|
98
|
+
|
99
|
+
## Production Implications
|
100
|
+
|
101
|
+
### Cost Savings
|
102
|
+
|
103
|
+
- **Infrastructure**: 2-4x lower memory requirements reduce server costs
|
104
|
+
- **Storage**: 2-68x smaller installation saves disk space
|
105
|
+
- **Processing**: 6-126x faster execution reduces compute time
|
106
|
+
|
107
|
+
### Operational Benefits
|
108
|
+
|
109
|
+
- **Deployment Speed**: Faster installations and updates
|
110
|
+
- **Resource Planning**: Predictable memory and CPU usage
|
111
|
+
- **Scaling**: Efficient resource utilization enables higher throughput
|
112
|
+
|
113
|
+
### Developer Experience
|
114
|
+
|
115
|
+
- **Quick Setup**: Minimal installation time and complexity
|
116
|
+
- **Reliable Performance**: Consistent results across document types
|
117
|
+
- **Production Ready**: Battle-tested performance characteristics
|
118
|
+
|
119
|
+
## Test Environment
|
120
|
+
|
121
|
+
**Hardware**: Linux CI runners
|
122
|
+
**Python Version**: 3.13
|
123
|
+
**Document Corpus**: ~100 real-world documents tested across multiple frameworks
|
124
|
+
**Test Date**: July 13, 2025
|
125
|
+
**Methodology**: [Full methodology available](https://github.com/Goldziher/python-text-extraction-libraries-benchmarks-2025)
|
126
|
+
|
127
|
+
## Framework Comparison Matrix
|
128
|
+
|
129
|
+
| Metric | Kreuzberg | Extractous | Unstructured | Docling |
|
130
|
+
| ------------------- | --------- | ---------- | ------------ | ------- |
|
131
|
+
| **Speed** | ★★★★★ | ★★☆☆☆ | ★★☆☆☆ | ★☆☆☆☆ |
|
132
|
+
| **Memory** | ★★★★★ | ★★★★☆ | ★★☆☆☆ | ★☆☆☆☆ |
|
133
|
+
| **Installation** | ★★★★★ | - | ★★★☆☆ | ★☆☆☆☆ |
|
134
|
+
| **Reliability** | ★★★★★ | ★★★★☆ | ★★★★★ | ★★★★☆ |
|
135
|
+
| **Content Quality** | ★★★★★ | ★★★☆☆ | ★★★☆☆ | ★★☆☆☆ |
|
136
|
+
| **Overall** | ★★★★★ | ★★★☆☆ | ★★★☆☆ | ★★☆☆☆ |
|
137
|
+
|
138
|
+
______________________________________________________________________
|
139
|
+
|
140
|
+
*Performance data is based on comprehensive benchmarking across a real-world document corpus. Results may vary based on specific use cases and hardware configurations.*
|
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import os
|
3
4
|
import re
|
4
5
|
from dataclasses import dataclass
|
5
6
|
from functools import lru_cache
|
@@ -181,8 +182,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
181
182
|
import spacy
|
182
183
|
|
183
184
|
if spacy_config.model_cache_dir:
|
184
|
-
import os
|
185
|
-
|
186
185
|
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
187
186
|
|
188
187
|
nlp = spacy.load(model_name)
|
@@ -3,10 +3,12 @@ from __future__ import annotations
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from typing import TYPE_CHECKING, ClassVar
|
5
5
|
|
6
|
+
from kreuzberg._types import ExtractionResult, normalize_metadata
|
7
|
+
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
|
8
|
+
|
6
9
|
if TYPE_CHECKING:
|
7
10
|
from pathlib import Path
|
8
11
|
|
9
|
-
from kreuzberg import ExtractionResult
|
10
12
|
from kreuzberg._types import ExtractionConfig
|
11
13
|
|
12
14
|
|
@@ -104,8 +106,6 @@ class Extractor(ABC):
|
|
104
106
|
if not self.config.enable_quality_processing:
|
105
107
|
return result
|
106
108
|
|
107
|
-
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
|
108
|
-
|
109
109
|
if not result.content:
|
110
110
|
return result
|
111
111
|
|
@@ -120,8 +120,6 @@ class Extractor(ABC):
|
|
120
120
|
enhanced_metadata["quality_score"] = quality_score
|
121
121
|
|
122
122
|
# Return enhanced result
|
123
|
-
from kreuzberg._types import ExtractionResult, normalize_metadata
|
124
|
-
|
125
123
|
return ExtractionResult(
|
126
124
|
content=cleaned_content,
|
127
125
|
mime_type=result.mime_type,
|
@@ -11,13 +11,17 @@ from anyio import Path as AsyncPath
|
|
11
11
|
from kreuzberg._extractors._base import Extractor
|
12
12
|
from kreuzberg._mime_types import IMAGE_MIME_TYPES
|
13
13
|
from kreuzberg._ocr import get_ocr_backend
|
14
|
-
from kreuzberg.
|
14
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
15
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
16
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
15
17
|
from kreuzberg._utils._tmp import create_temp_file
|
16
18
|
from kreuzberg.exceptions import ValidationError
|
17
19
|
|
18
20
|
if TYPE_CHECKING: # pragma: no cover
|
19
21
|
from collections.abc import Mapping
|
20
22
|
|
23
|
+
from kreuzberg._types import ExtractionResult
|
24
|
+
|
21
25
|
|
22
26
|
class ImageExtractor(Extractor):
|
23
27
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
|
@@ -78,44 +82,26 @@ class ImageExtractor(Extractor):
|
|
78
82
|
if self.config.ocr_backend is None:
|
79
83
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
80
84
|
|
81
|
-
|
82
|
-
from kreuzberg._ocr._sync import process_batch_images_sync
|
83
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
84
|
-
|
85
|
-
if isinstance(self.config.ocr_config, TesseractConfig):
|
86
|
-
config = self.config.ocr_config
|
87
|
-
else:
|
88
|
-
config = TesseractConfig()
|
89
|
-
|
90
|
-
results = process_batch_images_sync([str(path)], config, backend="tesseract")
|
91
|
-
if results:
|
92
|
-
result = results[0]
|
93
|
-
return self._apply_quality_processing(result)
|
94
|
-
return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
|
95
|
-
|
96
|
-
if self.config.ocr_backend == "paddleocr":
|
97
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
98
|
-
from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
|
85
|
+
backend = get_ocr_backend(self.config.ocr_backend)
|
99
86
|
|
87
|
+
if self.config.ocr_backend == "tesseract":
|
88
|
+
config = (
|
89
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
90
|
+
)
|
91
|
+
result = backend.process_file_sync(path, **config.__dict__)
|
92
|
+
elif self.config.ocr_backend == "paddleocr":
|
100
93
|
paddle_config = (
|
101
94
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
102
95
|
)
|
103
|
-
|
104
|
-
|
105
|
-
return self._apply_quality_processing(result)
|
106
|
-
|
107
|
-
if self.config.ocr_backend == "easyocr":
|
108
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
109
|
-
from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
|
110
|
-
|
96
|
+
result = backend.process_file_sync(path, **paddle_config.__dict__)
|
97
|
+
elif self.config.ocr_backend == "easyocr":
|
111
98
|
easy_config = (
|
112
99
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
113
100
|
)
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
101
|
+
result = backend.process_file_sync(path, **easy_config.__dict__)
|
102
|
+
else:
|
103
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
104
|
+
return self._apply_quality_processing(result)
|
119
105
|
|
120
106
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
121
107
|
if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
|
@@ -1,8 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import contextlib
|
4
|
+
import os
|
4
5
|
import re
|
6
|
+
import subprocess
|
5
7
|
import sys
|
8
|
+
import tempfile
|
6
9
|
from json import JSONDecodeError, loads
|
7
10
|
from pathlib import Path
|
8
11
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
|
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
|
|
203
206
|
Returns:
|
204
207
|
ExtractionResult with the extracted text and metadata.
|
205
208
|
"""
|
206
|
-
import os
|
207
|
-
import tempfile
|
208
|
-
from pathlib import Path
|
209
|
-
|
210
209
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
211
210
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
212
211
|
|
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
|
|
579
578
|
|
580
579
|
def _validate_pandoc_version_sync(self) -> None:
|
581
580
|
"""Synchronous version of _validate_pandoc_version."""
|
582
|
-
import subprocess
|
583
|
-
|
584
581
|
try:
|
585
582
|
if self._checked_version:
|
586
583
|
return
|
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
|
|
625
622
|
|
626
623
|
def _extract_metadata_sync(self, path: Path) -> Metadata:
|
627
624
|
"""Synchronous version of _handle_extract_metadata."""
|
628
|
-
import os
|
629
|
-
import subprocess
|
630
|
-
import tempfile
|
631
|
-
|
632
625
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
633
626
|
fd, metadata_file = tempfile.mkstemp(suffix=".json")
|
634
627
|
os.close(fd)
|
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
|
|
663
656
|
|
664
657
|
def _extract_file_sync(self, path: Path) -> str:
|
665
658
|
"""Synchronous version of _handle_extract_file."""
|
666
|
-
import os
|
667
|
-
import subprocess
|
668
|
-
import tempfile
|
669
|
-
|
670
659
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
671
660
|
fd, output_path = tempfile.mkstemp(suffix=".md")
|
672
661
|
os.close(fd)
|