kreuzberg 3.7.0__tar.gz → 3.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.gitignore +2 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/PKG-INFO +66 -50
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/README.md +59 -47
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/pyproject.toml +4 -1
- kreuzberg-3.8.0/benchmarks/results/latest.json +607 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +36 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +145 -3
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/models.py +60 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +127 -3
- kreuzberg-3.8.0/docs/index.md +58 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/docker.md +1 -1
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/mcp-server.md +15 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_base.py +40 -0
- kreuzberg-3.8.0/kreuzberg/_extractors/_email.py +149 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_html.py +15 -3
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_image.py +17 -18
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_pdf.py +68 -14
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg-3.8.0/kreuzberg/_extractors/_spread_sheet.py +358 -0
- kreuzberg-3.8.0/kreuzberg/_extractors/_structured.py +148 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_gmft.py +2 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_mime_types.py +27 -1
- kreuzberg-3.8.0/kreuzberg/_multiprocessing/__init__.py +5 -0
- kreuzberg-3.8.0/kreuzberg/_ocr/__init__.py +47 -0
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/tesseract_pool.py → kreuzberg-3.8.0/kreuzberg/_ocr/_pool.py +3 -5
- kreuzberg-3.8.0/kreuzberg/_ocr/_sync.py +566 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_ocr/_tesseract.py +6 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_registry.py +4 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_types.py +131 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_cache.py +17 -2
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/process_manager.py → kreuzberg-3.8.0/kreuzberg/_utils/_process_pool.py +90 -2
- kreuzberg-3.8.0/kreuzberg/_utils/_quality.py +237 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg-3.8.0/kreuzberg/_utils/_string.py +182 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_sync.py +5 -2
- kreuzberg-3.8.0/kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/pyproject.toml +25 -4
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/api/main_test.py +162 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extraction_test.py +12 -3
- kreuzberg-3.8.0/tests/extractors/email_comprehensive_test.py +326 -0
- kreuzberg-3.8.0/tests/extractors/email_test.py +31 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/image_test.py +22 -10
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/pdf_test.py +2 -2
- kreuzberg-3.8.0/tests/extractors/structured_test.py +90 -0
- kreuzberg-3.8.0/tests/multiprocessing/gmft_isolated_test.py +488 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/multiprocessing/process_manager_test.py +1 -1
- kreuzberg-3.8.0/tests/multiprocessing/sync_easyocr_test.py +640 -0
- kreuzberg-3.8.0/tests/multiprocessing/sync_paddleocr_test.py +529 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/multiprocessing/sync_tesseract_test.py +29 -33
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/multiprocessing/tesseract_pool_test.py +2 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/paddleocr_test.py +4 -4
- kreuzberg-3.8.0/tests/test_source_files/better-ocr-image.jpg +0 -0
- kreuzberg-3.8.0/tests/test_source_files/email/sample-email.eml +11 -0
- kreuzberg-3.8.0/tests/test_source_files/json/sample-document.json +1 -0
- kreuzberg-3.8.0/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- kreuzberg-3.8.0/tests/test_source_files/toml/sample-config.toml +33 -0
- kreuzberg-3.8.0/tests/test_source_files/yaml/sample-config.yaml +15 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/string_test.py +3 -3
- kreuzberg-3.8.0/tests/utils/table_test.py +413 -0
- kreuzberg-3.8.0/uv.lock +6637 -0
- kreuzberg-3.7.0/.gitmodules +0 -3
- kreuzberg-3.7.0/docs/index.md +0 -16
- kreuzberg-3.7.0/kreuzberg/_extractors/_spread_sheet.py +0 -183
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg-3.7.0/kreuzberg/_ocr/__init__.py +0 -17
- kreuzberg-3.7.0/kreuzberg/_utils/_process_pool.py +0 -100
- kreuzberg-3.7.0/kreuzberg/_utils/_string.py +0 -39
- kreuzberg-3.7.0/uv.lock +0 -4369
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.commitlintrc +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.docker/README.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.dockerignore +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/LICENSE +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/changelog.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/cli.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/contributing.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/conftest.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/gmft_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/types_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.0}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.7.0
|
3
|
+
Version: 3.8.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -23,9 +23,9 @@ Classifier: Topic :: Utilities
|
|
23
23
|
Classifier: Typing :: Typed
|
24
24
|
Requires-Python: >=3.10
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
|
-
Requires-Dist:
|
26
|
+
Requires-Dist: chardetng-py>=0.3.4
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
29
29
|
Requires-Dist: mcp>=1.11.0
|
30
30
|
Requires-Dist: msgspec>=0.18.0
|
31
31
|
Requires-Dist: playa-pdf>=0.6.1
|
@@ -34,6 +34,9 @@ Requires-Dist: pypdfium2==4.30.0
|
|
34
34
|
Requires-Dist: python-calamine>=0.3.2
|
35
35
|
Requires-Dist: python-pptx>=1.0.2
|
36
36
|
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: additional-extensions
|
38
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
39
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
37
40
|
Provides-Extra: all
|
38
41
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
42
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
@@ -41,6 +44,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
|
41
44
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
42
45
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
43
46
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
47
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
44
48
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
45
49
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
46
50
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
@@ -77,22 +81,51 @@ Description-Content-Type: text/markdown
|
|
77
81
|
[](https://badge.fury.io/py/kreuzberg)
|
78
82
|
[](https://goldziher.github.io/kreuzberg/)
|
79
83
|
[](https://opensource.org/licenses/MIT)
|
84
|
+
[](https://github.com/Goldziher/kreuzberg)
|
80
85
|
|
81
|
-
**High-performance
|
86
|
+
**High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
|
82
87
|
|
83
88
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
84
89
|
|
85
|
-
## Why Kreuzberg?
|
90
|
+
## Why Choose Kreuzberg?
|
86
91
|
|
87
|
-
|
88
|
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
-
|
92
|
-
-
|
93
|
-
|
94
|
-
|
95
|
-
|
92
|
+
### 🚀 Performance
|
93
|
+
|
94
|
+
- [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
|
95
|
+
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
96
|
+
- Lowest memory usage (~530MB average) optimized for production workloads
|
97
|
+
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
98
|
+
|
99
|
+
### 🛠️ Engineering Quality
|
100
|
+
|
101
|
+
- Built by software engineers with modern Python best practices
|
102
|
+
- 95%+ test coverage with comprehensive test suite
|
103
|
+
- Thoroughly benchmarked and profiled for real-world performance
|
104
|
+
- Only framework offering true async/await support alongside sync APIs
|
105
|
+
- Robust error handling and detailed logging
|
106
|
+
|
107
|
+
### 🎯 Developer Experience
|
108
|
+
|
109
|
+
- Works out of the box with sane defaults, scales with your needs
|
110
|
+
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
111
|
+
- Full type safety with excellent IDE support (completions)
|
112
|
+
- Comprehensive documentation including full API reference
|
113
|
+
|
114
|
+
### 🌍 Deployment Options
|
115
|
+
|
116
|
+
- Docker images for all architectures (AMD64, ARM64)
|
117
|
+
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
118
|
+
- CPU-only processing - no GPU requirements, lower energy consumption
|
119
|
+
- 100% local processing - no external API dependencies
|
120
|
+
- Multiple deployment modes: CLI, REST API, MCP server
|
121
|
+
|
122
|
+
### 🎯 Complete Solution
|
123
|
+
|
124
|
+
- Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
|
125
|
+
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
126
|
+
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
127
|
+
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
128
|
+
- Fully extensible: Add your own extractors
|
96
129
|
|
97
130
|
## Quick Start
|
98
131
|
|
@@ -197,7 +230,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
197
230
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
198
231
|
```
|
199
232
|
|
200
|
-
Available variants: `latest`, `
|
233
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
201
234
|
|
202
235
|
### 🌐 REST API
|
203
236
|
|
@@ -240,23 +273,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
240
273
|
| **Web** | HTML, XML, MHTML |
|
241
274
|
| **Archives** | Support via extraction |
|
242
275
|
|
243
|
-
## Performance
|
276
|
+
## 📊 Performance Comparison
|
244
277
|
|
245
|
-
|
278
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
246
279
|
|
247
|
-
|
|
248
|
-
| ------------- |
|
249
|
-
| **Kreuzberg** |
|
250
|
-
| Unstructured |
|
251
|
-
| MarkItDown |
|
252
|
-
| Docling |
|
280
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
281
|
+
| ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
|
282
|
+
| **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
|
283
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
284
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
285
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
253
286
|
|
254
|
-
\*
|
255
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
256
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
287
|
+
\*_Performance varies significantly with document complexity and size_
|
257
288
|
|
258
|
-
|
259
|
-
|
289
|
+
**Key strengths:**
|
290
|
+
|
291
|
+
- 2-3x faster processing than comparable frameworks
|
292
|
+
- Smallest installation footprint and memory usage
|
293
|
+
- Only framework with built-in async/await support
|
294
|
+
- CPU-only processing - no GPU dependencies
|
295
|
+
- Built by software engineers for production reliability
|
296
|
+
|
297
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
260
298
|
|
261
299
|
## Documentation
|
262
300
|
|
@@ -270,28 +308,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
270
308
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
271
309
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
272
310
|
|
273
|
-
## Advanced Features
|
274
|
-
|
275
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
276
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
277
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
278
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
279
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
280
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
281
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
282
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
283
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
284
|
-
|
285
311
|
## License
|
286
312
|
|
287
313
|
MIT License - see [LICENSE](LICENSE) for details.
|
288
|
-
|
289
|
-
______________________________________________________________________
|
290
|
-
|
291
|
-
<div align="center">
|
292
|
-
|
293
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
294
|
-
|
295
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
296
|
-
|
297
|
-
</div>
|
@@ -4,22 +4,51 @@
|
|
4
4
|
[](https://badge.fury.io/py/kreuzberg)
|
5
5
|
[](https://goldziher.github.io/kreuzberg/)
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
7
|
+
[](https://github.com/Goldziher/kreuzberg)
|
7
8
|
|
8
|
-
**High-performance
|
9
|
+
**High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
|
9
10
|
|
10
11
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
11
12
|
|
12
|
-
## Why Kreuzberg?
|
13
|
+
## Why Choose Kreuzberg?
|
13
14
|
|
14
|
-
|
15
|
-
|
16
|
-
-
|
17
|
-
-
|
18
|
-
-
|
19
|
-
-
|
20
|
-
|
21
|
-
|
22
|
-
|
15
|
+
### 🚀 Performance
|
16
|
+
|
17
|
+
- [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
|
18
|
+
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
19
|
+
- Lowest memory usage (~530MB average) optimized for production workloads
|
20
|
+
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
21
|
+
|
22
|
+
### 🛠️ Engineering Quality
|
23
|
+
|
24
|
+
- Built by software engineers with modern Python best practices
|
25
|
+
- 95%+ test coverage with comprehensive test suite
|
26
|
+
- Thoroughly benchmarked and profiled for real-world performance
|
27
|
+
- Only framework offering true async/await support alongside sync APIs
|
28
|
+
- Robust error handling and detailed logging
|
29
|
+
|
30
|
+
### 🎯 Developer Experience
|
31
|
+
|
32
|
+
- Works out of the box with sane defaults, scales with your needs
|
33
|
+
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
34
|
+
- Full type safety with excellent IDE support (completions)
|
35
|
+
- Comprehensive documentation including full API reference
|
36
|
+
|
37
|
+
### 🌍 Deployment Options
|
38
|
+
|
39
|
+
- Docker images for all architectures (AMD64, ARM64)
|
40
|
+
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
41
|
+
- CPU-only processing - no GPU requirements, lower energy consumption
|
42
|
+
- 100% local processing - no external API dependencies
|
43
|
+
- Multiple deployment modes: CLI, REST API, MCP server
|
44
|
+
|
45
|
+
### 🎯 Complete Solution
|
46
|
+
|
47
|
+
- Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
|
48
|
+
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
49
|
+
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
50
|
+
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
51
|
+
- Fully extensible: Add your own extractors
|
23
52
|
|
24
53
|
## Quick Start
|
25
54
|
|
@@ -124,7 +153,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
124
153
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
125
154
|
```
|
126
155
|
|
127
|
-
Available variants: `latest`, `
|
156
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
128
157
|
|
129
158
|
### 🌐 REST API
|
130
159
|
|
@@ -167,23 +196,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
167
196
|
| **Web** | HTML, XML, MHTML |
|
168
197
|
| **Archives** | Support via extraction |
|
169
198
|
|
170
|
-
## Performance
|
199
|
+
## 📊 Performance Comparison
|
171
200
|
|
172
|
-
|
201
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
173
202
|
|
174
|
-
|
|
175
|
-
| ------------- |
|
176
|
-
| **Kreuzberg** |
|
177
|
-
| Unstructured |
|
178
|
-
| MarkItDown |
|
179
|
-
| Docling |
|
203
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
204
|
+
| ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
|
205
|
+
| **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
|
206
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
207
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
208
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
180
209
|
|
181
|
-
\*
|
182
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
183
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
210
|
+
\*_Performance varies significantly with document complexity and size_
|
184
211
|
|
185
|
-
|
186
|
-
|
212
|
+
**Key strengths:**
|
213
|
+
|
214
|
+
- 2-3x faster processing than comparable frameworks
|
215
|
+
- Smallest installation footprint and memory usage
|
216
|
+
- Only framework with built-in async/await support
|
217
|
+
- CPU-only processing - no GPU dependencies
|
218
|
+
- Built by software engineers for production reliability
|
219
|
+
|
220
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
187
221
|
|
188
222
|
## Documentation
|
189
223
|
|
@@ -197,28 +231,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
197
231
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
198
232
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
199
233
|
|
200
|
-
## Advanced Features
|
201
|
-
|
202
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
203
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
204
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
205
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
206
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
207
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
208
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
209
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
210
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
211
|
-
|
212
234
|
## License
|
213
235
|
|
214
236
|
MIT License - see [LICENSE](LICENSE) for details.
|
215
|
-
|
216
|
-
______________________________________________________________________
|
217
|
-
|
218
|
-
<div align="center">
|
219
|
-
|
220
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
221
|
-
|
222
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
223
|
-
|
224
|
-
</div>
|
@@ -3,9 +3,12 @@ name = "kreuzberg-benchmarks"
|
|
3
3
|
version = "0.1.0"
|
4
4
|
description = "Performance benchmarking suite for Kreuzberg text extraction library"
|
5
5
|
readme = "README.md"
|
6
|
-
requires-python = ">=3.13"
|
6
|
+
requires-python = ">=3.10"
|
7
7
|
classifiers = [
|
8
8
|
"Programming Language :: Python :: 3 :: Only",
|
9
|
+
"Programming Language :: Python :: 3.10",
|
10
|
+
"Programming Language :: Python :: 3.11",
|
11
|
+
"Programming Language :: Python :: 3.12",
|
9
12
|
"Programming Language :: Python :: 3.13",
|
10
13
|
]
|
11
14
|
dependencies = [
|