kreuzberg 3.8.1.tar.gz → 3.8.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.8.2/PKG-INFO +265 -0
- kreuzberg-3.8.2/README.md +182 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/benchmark_baseline.py +1 -2
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/end_to_end_benchmark.py +2 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/serialization_benchmark.py +2 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/models.py +7 -7
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/statistical_benchmark.py +1 -2
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/advanced/custom-extractors.md +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/advanced/index.md +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/advanced/performance.md +6 -6
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/index.md +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/getting-started/installation.md +2 -2
- kreuzberg-3.8.2/docs/index.md +58 -0
- kreuzberg-3.8.2/docs/performance-analysis.md +168 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/basic-usage.md +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/docker.md +99 -4
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/extraction-configuration.md +145 -2
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/index.md +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/mcp-server.md +13 -13
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/__init__.py +4 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_api/main.py +22 -1
- kreuzberg-3.8.2/kreuzberg/_config.py +404 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_entity_extraction.py +3 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_pdf.py +22 -19
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_spread_sheet.py +2 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_structured.py +10 -7
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_gmft.py +8 -11
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_language_detection.py +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_mcp/server.py +58 -8
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_ocr/_easyocr.py +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_ocr/_paddleocr.py +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_ocr/_tesseract.py +2 -7
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_playa.py +2 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_types.py +46 -24
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_cache.py +15 -17
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_device.py +10 -20
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_errors.py +41 -38
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_quality.py +7 -11
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_serialization.py +21 -16
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_string.py +22 -12
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_table.py +3 -4
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/cli.py +3 -3
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/exceptions.py +10 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/extraction.py +2 -2
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/mkdocs.yaml +1 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/pyproject.toml +14 -8
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/cli_test.py +1 -1
- kreuzberg-3.8.2/tests/config_test.py +401 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/image_test.py +35 -8
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/mcp_server_test.py +8 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/multiprocessing/gmft_isolated_test.py +50 -41
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/tesseract_test.py +2 -2
- kreuzberg-3.8.2/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/types_test.py +60 -1
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/uv.lock +1 -1
- kreuzberg-3.8.1/PKG-INFO +0 -301
- kreuzberg-3.8.1/README.md +0 -219
- kreuzberg-3.8.1/docs/index.md +0 -54
- kreuzberg-3.8.1/docs/performance-analysis.md +0 -140
- kreuzberg-3.8.1/kreuzberg/_cli_config.py +0 -175
- kreuzberg-3.8.1/tests/test_source_files/toml/sample-config.toml +0 -33
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.commitlintrc +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.docker/Dockerfile +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.docker/README.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.dockerignore +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.gitignore +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.markdownlint.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/LICENSE +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/ai-rulez.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/README.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/assets/logo.png +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/changelog.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/cli.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/contributing.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/css/extra.css +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/examples/index.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/api/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/api/main_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/chunker_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/conftest.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extraction_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/email_comprehensive_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/gmft_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/hooks_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/playa_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/registry_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.8.1 → kreuzberg-3.8.2}/tests/utils/tmp_test.py +0 -0
kreuzberg-3.8.2/PKG-INFO
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.8.2
|
4
|
+
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
|
+
Project-URL: documentation, https://kreuzberg.dev
|
6
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
7
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
|
+
License: MIT
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
24
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Classifier: Topic :: Text Processing :: General
|
29
|
+
Classifier: Typing :: Typed
|
30
|
+
Requires-Python: >=3.10
|
31
|
+
Requires-Dist: anyio>=4.9.0
|
32
|
+
Requires-Dist: chardetng-py>=0.3.4
|
33
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
35
|
+
Requires-Dist: mcp>=1.11.0
|
36
|
+
Requires-Dist: msgspec>=0.18.0
|
37
|
+
Requires-Dist: playa-pdf>=0.6.1
|
38
|
+
Requires-Dist: psutil>=7.0.0
|
39
|
+
Requires-Dist: pypdfium2==4.30.0
|
40
|
+
Requires-Dist: python-calamine>=0.3.2
|
41
|
+
Requires-Dist: python-pptx>=1.0.2
|
42
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
43
|
+
Provides-Extra: additional-extensions
|
44
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
45
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
|
+
Provides-Extra: all
|
47
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
51
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
52
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
53
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
57
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
59
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
60
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
61
|
+
Provides-Extra: api
|
62
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
63
|
+
Provides-Extra: chunking
|
64
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
65
|
+
Provides-Extra: cli
|
66
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
67
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
68
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
69
|
+
Provides-Extra: easyocr
|
70
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
71
|
+
Provides-Extra: entity-extraction
|
72
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
73
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
74
|
+
Provides-Extra: gmft
|
75
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
76
|
+
Provides-Extra: langdetect
|
77
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
78
|
+
Provides-Extra: paddleocr
|
79
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
80
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
81
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
82
|
+
Description-Content-Type: text/markdown
|
83
|
+
|
84
|
+
# Kreuzberg
|
85
|
+
|
86
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
87
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
88
|
+
[Documentation](https://kreuzberg.dev/)
|
89
|
+
[Benchmarks](https://benchmarks.kreuzberg.dev/)
|
90
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
91
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
92
|
+
|
93
|
+
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
94
|
+
|
95
|
+
📖 **[Complete Documentation](https://kreuzberg.dev/)**
|
96
|
+
|
97
|
+
## Framework Overview
|
98
|
+
|
99
|
+
### Document Intelligence Capabilities
|
100
|
+
|
101
|
+
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
102
|
+
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
103
|
+
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
104
|
+
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
105
|
+
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
106
|
+
|
107
|
+
### Technical Architecture
|
108
|
+
|
109
|
+
- **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
|
110
|
+
- **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
|
111
|
+
- **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
|
112
|
+
- **API Design**: Synchronous and asynchronous APIs with consistent interfaces
|
113
|
+
- **Type Safety**: Complete type annotations throughout the codebase
|
114
|
+
|
115
|
+
### Open Source Foundation
|
116
|
+
|
117
|
+
Kreuzberg leverages established open source technologies:
|
118
|
+
|
119
|
+
- **Pandoc**: Universal document converter for robust format support
|
120
|
+
- **PDFium**: Google's PDF rendering engine for accurate PDF processing
|
121
|
+
- **Tesseract**: Google's OCR engine for text recognition
|
122
|
+
- **Python-docx/pptx**: Native Microsoft Office format support
|
123
|
+
|
124
|
+
## Quick Start
|
125
|
+
|
126
|
+
### Extract Text with CLI
|
127
|
+
|
128
|
+
```bash
|
129
|
+
# Extract text from any file to markdown
|
130
|
+
uvx kreuzberg extract document.pdf > output.md
|
131
|
+
|
132
|
+
# With all features (OCR, table extraction, etc.)
|
133
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
|
134
|
+
|
135
|
+
# Extract with rich metadata
|
136
|
+
uvx kreuzberg extract report.pdf --show-metadata --format json
|
137
|
+
```
|
138
|
+
|
139
|
+
### Python Usage
|
140
|
+
|
141
|
+
**Async (recommended for web apps):**
|
142
|
+
|
143
|
+
```python
|
144
|
+
from kreuzberg import extract_file
|
145
|
+
|
146
|
+
# In your async function
|
147
|
+
result = await extract_file("presentation.pptx")
|
148
|
+
print(result.content)
|
149
|
+
|
150
|
+
# Rich metadata extraction
|
151
|
+
print(f"Title: {result.metadata.title}")
|
152
|
+
print(f"Author: {result.metadata.author}")
|
153
|
+
print(f"Page count: {result.metadata.page_count}")
|
154
|
+
print(f"Created: {result.metadata.created_at}")
|
155
|
+
```
|
156
|
+
|
157
|
+
**Sync (for scripts and CLI tools):**
|
158
|
+
|
159
|
+
```python
|
160
|
+
from kreuzberg import extract_file_sync
|
161
|
+
|
162
|
+
result = extract_file_sync("report.docx")
|
163
|
+
print(result.content)
|
164
|
+
|
165
|
+
# Access rich metadata
|
166
|
+
print(f"Language: {result.metadata.language}")
|
167
|
+
print(f"Word count: {result.metadata.word_count}")
|
168
|
+
print(f"Keywords: {result.metadata.keywords}")
|
169
|
+
```
|
170
|
+
|
171
|
+
### Docker
|
172
|
+
|
173
|
+
```bash
|
174
|
+
# Run the REST API
|
175
|
+
docker run -p 8000:8000 goldziher/kreuzberg
|
176
|
+
|
177
|
+
# Extract via API
|
178
|
+
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
179
|
+
```
|
180
|
+
|
181
|
+
📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
|
182
|
+
|
183
|
+
## Deployment Options
|
184
|
+
|
185
|
+
### 🤖 MCP Server (AI Integration)
|
186
|
+
|
187
|
+
**Add to Claude Desktop with one command:**
|
188
|
+
|
189
|
+
```bash
|
190
|
+
claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
|
191
|
+
```
|
192
|
+
|
193
|
+
**Or configure manually in `claude_desktop_config.json`:**
|
194
|
+
|
195
|
+
```json
|
196
|
+
{
|
197
|
+
"mcpServers": {
|
198
|
+
"kreuzberg": {
|
199
|
+
"command": "uvx",
|
200
|
+
"args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
|
201
|
+
}
|
202
|
+
}
|
203
|
+
}
|
204
|
+
```
|
205
|
+
|
206
|
+
**MCP capabilities:**
|
207
|
+
|
208
|
+
- Extract text from PDFs, images, Office docs, and more
|
209
|
+
- Full OCR support with multiple engines
|
210
|
+
- Table extraction and metadata parsing
|
211
|
+
|
212
|
+
📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
|
213
|
+
|
214
|
+
## Supported Formats
|
215
|
+
|
216
|
+
| Category | Formats |
|
217
|
+
| ----------------- | ------------------------------ |
|
218
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
219
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
220
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
221
|
+
| **Presentations** | PPTX, PPT, ODP |
|
222
|
+
| **Web** | HTML, XML, MHTML |
|
223
|
+
| **Archives** | Support via extraction |
|
224
|
+
|
225
|
+
## 📊 Performance Characteristics
|
226
|
+
|
227
|
+
[View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
|
228
|
+
|
229
|
+
### Technical Specifications
|
230
|
+
|
231
|
+
| Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
|
232
|
+
| ---------------------------- | -------------- | --------------- | ------------------ |
|
233
|
+
| **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
|
234
|
+
| **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
|
235
|
+
| **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
|
236
|
+
| **Installation size** | 71 MB | 71 MB | Smallest size |
|
237
|
+
| **Success rate** | 100% | 100% | Perfect |
|
238
|
+
| **Supported formats** | 18 | 18 | Comprehensive |
|
239
|
+
|
240
|
+
### Architecture Advantages
|
241
|
+
|
242
|
+
- **Native C extensions**: Built on PDFium and Tesseract for maximum performance
|
243
|
+
- **Async/await support**: True asynchronous processing with intelligent task scheduling
|
244
|
+
- **Memory efficiency**: Streaming architecture minimizes memory allocation
|
245
|
+
- **Process pooling**: Automatic multiprocessing for CPU-intensive operations
|
246
|
+
- **Optimized data flow**: Efficient data handling with minimal transformations
|
247
|
+
|
248
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
249
|
+
|
250
|
+
## Documentation
|
251
|
+
|
252
|
+
### Quick Links
|
253
|
+
|
254
|
+
- [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
|
255
|
+
- [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
|
256
|
+
- [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
|
257
|
+
- [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
|
258
|
+
- [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
|
259
|
+
- [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
|
260
|
+
- [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
|
261
|
+
- [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
|
262
|
+
|
263
|
+
## License
|
264
|
+
|
265
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
kreuzberg-3.8.2/README.md
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
# Kreuzberg
|
2
|
+
|
3
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://kreuzberg.dev/)
|
6
|
+
[Benchmarks](https://benchmarks.kreuzberg.dev/)
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
8
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
9
|
+
|
10
|
+
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
11
|
+
|
12
|
+
📖 **[Complete Documentation](https://kreuzberg.dev/)**
|
13
|
+
|
14
|
+
## Framework Overview
|
15
|
+
|
16
|
+
### Document Intelligence Capabilities
|
17
|
+
|
18
|
+
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
19
|
+
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
20
|
+
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
21
|
+
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
22
|
+
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
23
|
+
|
24
|
+
### Technical Architecture
|
25
|
+
|
26
|
+
- **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
|
27
|
+
- **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
|
28
|
+
- **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
|
29
|
+
- **API Design**: Synchronous and asynchronous APIs with consistent interfaces
|
30
|
+
- **Type Safety**: Complete type annotations throughout the codebase
|
31
|
+
|
32
|
+
### Open Source Foundation
|
33
|
+
|
34
|
+
Kreuzberg leverages established open source technologies:
|
35
|
+
|
36
|
+
- **Pandoc**: Universal document converter for robust format support
|
37
|
+
- **PDFium**: Google's PDF rendering engine for accurate PDF processing
|
38
|
+
- **Tesseract**: Google's OCR engine for text recognition
|
39
|
+
- **Python-docx/pptx**: Native Microsoft Office format support
|
40
|
+
|
41
|
+
## Quick Start
|
42
|
+
|
43
|
+
### Extract Text with CLI
|
44
|
+
|
45
|
+
```bash
|
46
|
+
# Extract text from any file to markdown
|
47
|
+
uvx kreuzberg extract document.pdf > output.md
|
48
|
+
|
49
|
+
# With all features (OCR, table extraction, etc.)
|
50
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
|
51
|
+
|
52
|
+
# Extract with rich metadata
|
53
|
+
uvx kreuzberg extract report.pdf --show-metadata --format json
|
54
|
+
```
|
55
|
+
|
56
|
+
### Python Usage
|
57
|
+
|
58
|
+
**Async (recommended for web apps):**
|
59
|
+
|
60
|
+
```python
|
61
|
+
from kreuzberg import extract_file
|
62
|
+
|
63
|
+
# In your async function
|
64
|
+
result = await extract_file("presentation.pptx")
|
65
|
+
print(result.content)
|
66
|
+
|
67
|
+
# Rich metadata extraction
|
68
|
+
print(f"Title: {result.metadata.title}")
|
69
|
+
print(f"Author: {result.metadata.author}")
|
70
|
+
print(f"Page count: {result.metadata.page_count}")
|
71
|
+
print(f"Created: {result.metadata.created_at}")
|
72
|
+
```
|
73
|
+
|
74
|
+
**Sync (for scripts and CLI tools):**
|
75
|
+
|
76
|
+
```python
|
77
|
+
from kreuzberg import extract_file_sync
|
78
|
+
|
79
|
+
result = extract_file_sync("report.docx")
|
80
|
+
print(result.content)
|
81
|
+
|
82
|
+
# Access rich metadata
|
83
|
+
print(f"Language: {result.metadata.language}")
|
84
|
+
print(f"Word count: {result.metadata.word_count}")
|
85
|
+
print(f"Keywords: {result.metadata.keywords}")
|
86
|
+
```
|
87
|
+
|
88
|
+
### Docker
|
89
|
+
|
90
|
+
```bash
|
91
|
+
# Run the REST API
|
92
|
+
docker run -p 8000:8000 goldziher/kreuzberg
|
93
|
+
|
94
|
+
# Extract via API
|
95
|
+
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
96
|
+
```
|
97
|
+
|
98
|
+
📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
|
99
|
+
|
100
|
+
## Deployment Options
|
101
|
+
|
102
|
+
### 🤖 MCP Server (AI Integration)
|
103
|
+
|
104
|
+
**Add to Claude Desktop with one command:**
|
105
|
+
|
106
|
+
```bash
|
107
|
+
claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
|
108
|
+
```
|
109
|
+
|
110
|
+
**Or configure manually in `claude_desktop_config.json`:**
|
111
|
+
|
112
|
+
```json
|
113
|
+
{
|
114
|
+
"mcpServers": {
|
115
|
+
"kreuzberg": {
|
116
|
+
"command": "uvx",
|
117
|
+
"args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
|
118
|
+
}
|
119
|
+
}
|
120
|
+
}
|
121
|
+
```
|
122
|
+
|
123
|
+
**MCP capabilities:**
|
124
|
+
|
125
|
+
- Extract text from PDFs, images, Office docs, and more
|
126
|
+
- Full OCR support with multiple engines
|
127
|
+
- Table extraction and metadata parsing
|
128
|
+
|
129
|
+
📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
|
130
|
+
|
131
|
+
## Supported Formats
|
132
|
+
|
133
|
+
| Category | Formats |
|
134
|
+
| ----------------- | ------------------------------ |
|
135
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
136
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
137
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
138
|
+
| **Presentations** | PPTX, PPT, ODP |
|
139
|
+
| **Web** | HTML, XML, MHTML |
|
140
|
+
| **Archives** | Support via extraction |
|
141
|
+
|
142
|
+
## 📊 Performance Characteristics
|
143
|
+
|
144
|
+
[View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
|
145
|
+
|
146
|
+
### Technical Specifications
|
147
|
+
|
148
|
+
| Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
|
149
|
+
| ---------------------------- | -------------- | --------------- | ------------------ |
|
150
|
+
| **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
|
151
|
+
| **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
|
152
|
+
| **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
|
153
|
+
| **Installation size** | 71 MB | 71 MB | Smallest size |
|
154
|
+
| **Success rate** | 100% | 100% | Perfect |
|
155
|
+
| **Supported formats** | 18 | 18 | Comprehensive |
|
156
|
+
|
157
|
+
### Architecture Advantages
|
158
|
+
|
159
|
+
- **Native C extensions**: Built on PDFium and Tesseract for maximum performance
|
160
|
+
- **Async/await support**: True asynchronous processing with intelligent task scheduling
|
161
|
+
- **Memory efficiency**: Streaming architecture minimizes memory allocation
|
162
|
+
- **Process pooling**: Automatic multiprocessing for CPU-intensive operations
|
163
|
+
- **Optimized data flow**: Efficient data handling with minimal transformations
|
164
|
+
|
165
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
166
|
+
|
167
|
+
## Documentation
|
168
|
+
|
169
|
+
### Quick Links
|
170
|
+
|
171
|
+
- [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
|
172
|
+
- [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
|
173
|
+
- [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
|
174
|
+
- [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
|
175
|
+
- [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
|
176
|
+
- [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
|
177
|
+
- [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
|
178
|
+
- [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
|
179
|
+
|
180
|
+
## License
|
181
|
+
|
182
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Baseline performance benchmark before implementing multi-layer caching."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import json
|
4
5
|
import time
|
5
6
|
from pathlib import Path
|
6
7
|
|
@@ -110,8 +111,6 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
|
|
110
111
|
if __name__ == "__main__":
|
111
112
|
baseline_results = asyncio.run(run_baseline_benchmark())
|
112
113
|
|
113
|
-
import json
|
114
|
-
|
115
114
|
baseline_file = Path("baseline_results.json")
|
116
115
|
with baseline_file.open("w") as f:
|
117
116
|
json.dump(baseline_results, f, indent=2, default=str)
|
@@ -1,8 +1,10 @@
|
|
1
1
|
"""End-to-end reproducible benchmark with proper statistics."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import json
|
4
5
|
import statistics
|
5
6
|
import time
|
7
|
+
from datetime import datetime
|
6
8
|
from pathlib import Path
|
7
9
|
from typing import Any
|
8
10
|
|
@@ -201,9 +203,6 @@ if __name__ == "__main__":
|
|
201
203
|
try:
|
202
204
|
results = asyncio.run(run_end_to_end_benchmark(trials=30))
|
203
205
|
|
204
|
-
import json
|
205
|
-
from datetime import datetime
|
206
|
-
|
207
206
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
208
207
|
results_file = Path(f"benchmark_msgpack_{timestamp}.json")
|
209
208
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
"""Direct serialization performance comparison: JSON vs msgpack."""
|
2
2
|
|
3
|
-
import
|
3
|
+
import json
|
4
4
|
import statistics
|
5
|
+
import time
|
5
6
|
from pathlib import Path
|
6
7
|
|
7
8
|
from kreuzberg._types import ExtractionResult
|
@@ -151,8 +152,6 @@ if __name__ == "__main__":
|
|
151
152
|
try:
|
152
153
|
results = benchmark_serialization()
|
153
154
|
|
154
|
-
import json
|
155
|
-
|
156
155
|
results_file = Path("serialization_benchmark_results.json")
|
157
156
|
with results_file.open("w") as f:
|
158
157
|
json.dump(results, f, indent=2, default=str)
|
@@ -11,7 +11,7 @@ from typing import Any
|
|
11
11
|
import psutil
|
12
12
|
|
13
13
|
|
14
|
-
@dataclass
|
14
|
+
@dataclass(slots=True)
|
15
15
|
class SystemInfo:
|
16
16
|
"""System information for benchmark context."""
|
17
17
|
|
@@ -42,7 +42,7 @@ class SystemInfo:
|
|
42
42
|
)
|
43
43
|
|
44
44
|
|
45
|
-
@dataclass
|
45
|
+
@dataclass(slots=True)
|
46
46
|
class PerformanceMetrics:
|
47
47
|
"""Performance metrics for a single benchmark run."""
|
48
48
|
|
@@ -55,7 +55,7 @@ class PerformanceMetrics:
|
|
55
55
|
exception_info: str | None = None
|
56
56
|
|
57
57
|
|
58
|
-
@dataclass
|
58
|
+
@dataclass(slots=True)
|
59
59
|
class MetadataQualityMetrics:
|
60
60
|
"""Metadata quality metrics for extraction result."""
|
61
61
|
|
@@ -71,7 +71,7 @@ class MetadataQualityMetrics:
|
|
71
71
|
extraction_backend: str | None = None
|
72
72
|
|
73
73
|
|
74
|
-
@dataclass
|
74
|
+
@dataclass(slots=True)
|
75
75
|
class ExtractionQualityMetrics:
|
76
76
|
"""Quality metrics for extraction result."""
|
77
77
|
|
@@ -86,7 +86,7 @@ class ExtractionQualityMetrics:
|
|
86
86
|
metadata_quality: MetadataQualityMetrics | None = None
|
87
87
|
|
88
88
|
|
89
|
-
@dataclass
|
89
|
+
@dataclass(slots=True)
|
90
90
|
class BenchmarkResult:
|
91
91
|
"""Complete result of a single benchmark."""
|
92
92
|
|
@@ -98,7 +98,7 @@ class BenchmarkResult:
|
|
98
98
|
extraction_quality: ExtractionQualityMetrics | None = None
|
99
99
|
|
100
100
|
|
101
|
-
@dataclass
|
101
|
+
@dataclass(slots=True)
|
102
102
|
class BenchmarkSuite:
|
103
103
|
"""Complete benchmark suite results."""
|
104
104
|
|
@@ -193,7 +193,7 @@ class BenchmarkSuite:
|
|
193
193
|
}
|
194
194
|
|
195
195
|
|
196
|
-
@dataclass
|
196
|
+
@dataclass(slots=True)
|
197
197
|
class FlameGraphConfig:
|
198
198
|
"""Configuration for flame graph generation."""
|
199
199
|
|
@@ -3,6 +3,7 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
5
|
import asyncio
|
6
|
+
import json
|
6
7
|
import time
|
7
8
|
import traceback
|
8
9
|
from typing import TYPE_CHECKING, Any, Callable
|
@@ -392,7 +393,6 @@ class BenchmarkRunner:
|
|
392
393
|
|
393
394
|
def save_results(self, suite: BenchmarkSuite, output_path: Path) -> None:
|
394
395
|
"""Save benchmark results to JSON file."""
|
395
|
-
import json
|
396
396
|
|
397
397
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
398
398
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Statistical benchmark comparing JSON vs msgpack with proper error analysis."""
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import json
|
4
5
|
import statistics
|
5
6
|
import time
|
6
7
|
from pathlib import Path
|
@@ -194,8 +195,6 @@ if __name__ == "__main__":
|
|
194
195
|
try:
|
195
196
|
results = asyncio.run(run_statistical_benchmark())
|
196
197
|
|
197
|
-
import json
|
198
|
-
|
199
198
|
results_file = Path("statistical_benchmark_results.json")
|
200
199
|
with results_file.open("w") as f:
|
201
200
|
json.dump(results, f, indent=2, default=str)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Custom Extractors
|
2
2
|
|
3
|
-
Kreuzberg
|
3
|
+
The Kreuzberg document intelligence framework provides an extensible architecture through the `ExtractorRegistry` system. This plugin mechanism enables developers to extend document processing capabilities by implementing custom extractors for specialized formats or processing requirements.
|
4
4
|
|
5
5
|
## Creating a Custom Extractor
|
6
6
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Advanced Topics
|
2
2
|
|
3
|
-
This section
|
3
|
+
This section explores advanced capabilities of the Kreuzberg document intelligence framework, including extensibility mechanisms, performance optimization, and custom integration patterns.
|
4
4
|
|
5
5
|
## Topics
|
6
6
|
|
@@ -16,13 +16,13 @@ Kreuzberg provides both synchronous and asynchronous APIs, each optimized for di
|
|
16
16
|
|
17
17
|
## Competitive Performance
|
18
18
|
|
19
|
-
[
|
19
|
+
[Live benchmarks](https://benchmarks.kreuzberg.dev/) ([source code](https://github.com/Goldziher/python-text-extraction-libs-benchmarks)) show that Kreuzberg is the fastest CPU-based Python text extraction framework:
|
20
20
|
|
21
|
-
- **
|
22
|
-
- **
|
23
|
-
- **Smallest Installation**:
|
24
|
-
- **100%
|
25
|
-
- **
|
21
|
+
- **Leading Performance**: 31.78 files/second for tiny documents, 2.42 files/second for medium files
|
22
|
+
- **Minimal Memory**: ~360MB average usage, lowest among tested frameworks
|
23
|
+
- **Smallest Installation**: 71MB package size for maximum deployment flexibility
|
24
|
+
- **High Reliability**: 100% success rate across all 18 tested file formats
|
25
|
+
- **Production Optimized**: Built for high-throughput, real-time applications
|
26
26
|
|
27
27
|
## Internal Benchmark Results
|
28
28
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# Installation
|
2
2
|
|
3
|
-
Kreuzberg is
|
3
|
+
Kreuzberg is a modular document intelligence framework with a core package and optional components for specialized functionality.
|
4
4
|
|
5
5
|
## System Dependencies
|
6
6
|
|
7
7
|
### Pandoc
|
8
8
|
|
9
|
-
Kreuzberg
|
9
|
+
Pandoc is the foundation of Kreuzberg's universal document conversion capabilities. This **required** system dependency enables reliable extraction across diverse document formats. Install Pandoc for your platform:
|
10
10
|
|
11
11
|
#### Ubuntu/Debian
|
12
12
|
|