kreuzberg 3.7.0__tar.gz → 3.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.gitignore +2 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/PKG-INFO +58 -54
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/README.md +43 -48
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/pyproject.toml +4 -1
- kreuzberg-3.8.1/benchmarks/results/latest.json +607 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +36 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +145 -3
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/models.py +60 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +127 -3
- kreuzberg-3.8.1/docs/index.md +54 -0
- kreuzberg-3.8.1/docs/performance-analysis.md +140 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/docker.md +1 -1
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/mcp-server.md +15 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_entity_extraction.py +1 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_base.py +39 -1
- kreuzberg-3.8.1/kreuzberg/_extractors/_email.py +149 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_html.py +15 -3
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_image.py +21 -36
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pandoc.py +3 -14
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_pdf.py +81 -48
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg-3.8.1/kreuzberg/_extractors/_spread_sheet.py +358 -0
- kreuzberg-3.8.1/kreuzberg/_extractors/_structured.py +148 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_gmft.py +314 -7
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mime_types.py +27 -1
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg-3.8.1/kreuzberg/_ocr/_base.py +113 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_easyocr.py +91 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg-3.8.1/kreuzberg/_ocr/_tesseract.py +996 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_registry.py +4 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_types.py +131 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_cache.py +52 -4
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_errors.py +3 -7
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/process_manager.py → kreuzberg-3.8.1/kreuzberg/_utils/_process_pool.py +86 -2
- kreuzberg-3.8.1/kreuzberg/_utils/_quality.py +237 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg-3.8.1/kreuzberg/_utils/_string.py +182 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_sync.py +5 -2
- kreuzberg-3.8.1/kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/cli.py +1 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/extraction.py +4 -22
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/mkdocs.yaml +1 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/pyproject.toml +41 -15
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/api/main_test.py +162 -2
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extraction_batch_test.py +4 -4
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extraction_test.py +12 -3
- kreuzberg-3.8.1/tests/extractors/email_comprehensive_test.py +326 -0
- kreuzberg-3.8.1/tests/extractors/email_test.py +31 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/image_test.py +64 -69
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pdf_test.py +2 -2
- kreuzberg-3.8.1/tests/extractors/structured_test.py +90 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/gmft_test.py +15 -2
- kreuzberg-3.8.1/tests/multiprocessing/gmft_isolated_test.py +489 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/process_manager_test.py +1 -1
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/tesseract_pool_test.py +4 -4
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/base_test.py +14 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/easyocr_test.py +36 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/paddleocr_test.py +54 -4
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/tesseract_test.py +44 -0
- kreuzberg-3.8.1/tests/test_source_files/better-ocr-image.jpg +0 -0
- kreuzberg-3.8.1/tests/test_source_files/email/sample-email.eml +11 -0
- kreuzberg-3.8.1/tests/test_source_files/json/sample-document.json +1 -0
- kreuzberg-3.8.1/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- kreuzberg-3.8.1/tests/test_source_files/toml/sample-config.toml +33 -0
- kreuzberg-3.8.1/tests/test_source_files/yaml/sample-config.yaml +15 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/process_pool_test.py +1 -1
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/string_test.py +3 -3
- kreuzberg-3.8.1/tests/utils/table_test.py +413 -0
- kreuzberg-3.8.1/uv.lock +6637 -0
- kreuzberg-3.7.0/.gitmodules +0 -3
- kreuzberg-3.7.0/docs/index.md +0 -16
- kreuzberg-3.7.0/kreuzberg/_extractors/_spread_sheet.py +0 -183
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg-3.7.0/kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.7.0/kreuzberg/_ocr/_base.py +0 -54
- kreuzberg-3.7.0/kreuzberg/_ocr/_tesseract.py +0 -436
- kreuzberg-3.7.0/kreuzberg/_utils/_process_pool.py +0 -100
- kreuzberg-3.7.0/kreuzberg/_utils/_string.py +0 -39
- kreuzberg-3.7.0/tests/multiprocessing/sync_tesseract_test.py +0 -366
- kreuzberg-3.7.0/uv.lock +0 -4369
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.commitlintrc +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.docker/README.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.dockerignore +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/LICENSE +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/changelog.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/cli.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/contributing.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/conftest.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/types_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.7.0 → kreuzberg-3.8.1}/tests/utils/tmp_test.py +0 -0
@@ -1,14 +1,16 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.7.0
|
4
|
-
Summary:
|
3
|
+
Version: 3.8.1
|
4
|
+
Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,entity-extraction,image-to-text,
|
9
|
+
Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: Intended Audience :: Information Technology
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
12
14
|
Classifier: License :: OSI Approved :: MIT License
|
13
15
|
Classifier: Operating System :: OS Independent
|
14
16
|
Classifier: Programming Language :: Python :: 3 :: Only
|
@@ -16,16 +18,19 @@ Classifier: Programming Language :: Python :: 3.10
|
|
16
18
|
Classifier: Programming Language :: Python :: 3.11
|
17
19
|
Classifier: Programming Language :: Python :: 3.12
|
18
20
|
Classifier: Programming Language :: Python :: 3.13
|
21
|
+
Classifier: Topic :: Database
|
22
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
23
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
19
24
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
20
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
27
|
Classifier: Topic :: Text Processing :: General
|
22
|
-
Classifier: Topic :: Utilities
|
23
28
|
Classifier: Typing :: Typed
|
24
29
|
Requires-Python: >=3.10
|
25
30
|
Requires-Dist: anyio>=4.9.0
|
26
|
-
Requires-Dist:
|
31
|
+
Requires-Dist: chardetng-py>=0.3.4
|
27
32
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
33
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
29
34
|
Requires-Dist: mcp>=1.11.0
|
30
35
|
Requires-Dist: msgspec>=0.18.0
|
31
36
|
Requires-Dist: playa-pdf>=0.6.1
|
@@ -34,6 +39,9 @@ Requires-Dist: pypdfium2==4.30.0
|
|
34
39
|
Requires-Dist: python-calamine>=0.3.2
|
35
40
|
Requires-Dist: python-pptx>=1.0.2
|
36
41
|
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
42
|
+
Provides-Extra: additional-extensions
|
43
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
44
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
37
45
|
Provides-Extra: all
|
38
46
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
47
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
@@ -41,6 +49,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
|
41
49
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
42
50
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
43
51
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
52
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
44
53
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
45
54
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
46
55
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
@@ -77,22 +86,33 @@ Description-Content-Type: text/markdown
|
|
77
86
|
[](https://badge.fury.io/py/kreuzberg)
|
78
87
|
[](https://goldziher.github.io/kreuzberg/)
|
79
88
|
[](https://opensource.org/licenses/MIT)
|
89
|
+
[](https://github.com/Goldziher/kreuzberg)
|
80
90
|
|
81
|
-
**
|
91
|
+
**Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
|
82
92
|
|
83
93
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
84
94
|
|
85
|
-
## Why Kreuzberg?
|
95
|
+
## Why Choose Kreuzberg?
|
86
96
|
|
87
|
-
|
88
|
-
|
89
|
-
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
97
|
+
### ⚡ Proven Performance
|
98
|
+
|
99
|
+
[Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
|
100
|
+
|
101
|
+
### 🏗️ Production Engineering
|
102
|
+
|
103
|
+
Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
|
104
|
+
|
105
|
+
### 🔧 Developer Experience
|
106
|
+
|
107
|
+
Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
|
108
|
+
|
109
|
+
### 🚀 Flexible Deployment
|
110
|
+
|
111
|
+
Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
|
112
|
+
|
113
|
+
### 📄 Comprehensive Format Support
|
114
|
+
|
115
|
+
Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
|
96
116
|
|
97
117
|
## Quick Start
|
98
118
|
|
@@ -128,7 +148,7 @@ import asyncio
|
|
128
148
|
from kreuzberg import extract_file
|
129
149
|
|
130
150
|
async def main():
|
131
|
-
# Extract from
|
151
|
+
# Extract content from files
|
132
152
|
result = await extract_file("document.pdf")
|
133
153
|
print(result.content)
|
134
154
|
print(result.metadata)
|
@@ -197,7 +217,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
197
217
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
198
218
|
```
|
199
219
|
|
200
|
-
Available variants: `latest`, `
|
220
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
201
221
|
|
202
222
|
### 🌐 REST API
|
203
223
|
|
@@ -240,23 +260,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
240
260
|
| **Web** | HTML, XML, MHTML |
|
241
261
|
| **Archives** | Support via extraction |
|
242
262
|
|
243
|
-
## Performance
|
263
|
+
## 📊 Performance Comparison
|
244
264
|
|
245
|
-
|
265
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
|
246
266
|
|
247
|
-
|
|
248
|
-
| ------------- |
|
249
|
-
| **Kreuzberg** |
|
250
|
-
| Unstructured |
|
251
|
-
| MarkItDown |
|
252
|
-
| Docling |
|
267
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
268
|
+
| ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
|
269
|
+
| **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
|
270
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
271
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
272
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
253
273
|
|
254
|
-
\*
|
255
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
256
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
274
|
+
\*_Performance varies significantly with document complexity and size_
|
257
275
|
|
258
|
-
|
259
|
-
|
276
|
+
**Key strengths:**
|
277
|
+
|
278
|
+
- 6-126x faster processing than comparable frameworks
|
279
|
+
- Smallest installation footprint and memory usage
|
280
|
+
- Only framework with built-in async/await support
|
281
|
+
- Supports both CPU and GPU processing
|
282
|
+
- Built by software engineers for production reliability
|
283
|
+
|
284
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
260
285
|
|
261
286
|
## Documentation
|
262
287
|
|
@@ -264,34 +289,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
264
289
|
|
265
290
|
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
266
291
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
292
|
+
- [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
|
267
293
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
268
294
|
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
269
295
|
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
270
296
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
271
297
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
272
298
|
|
273
|
-
## Advanced Features
|
274
|
-
|
275
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
276
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
277
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
278
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
279
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
280
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
281
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
282
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
283
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
284
|
-
|
285
299
|
## License
|
286
300
|
|
287
301
|
MIT License - see [LICENSE](LICENSE) for details.
|
288
|
-
|
289
|
-
______________________________________________________________________
|
290
|
-
|
291
|
-
<div align="center">
|
292
|
-
|
293
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
294
|
-
|
295
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
296
|
-
|
297
|
-
</div>
|
@@ -4,22 +4,33 @@
|
|
4
4
|
[](https://badge.fury.io/py/kreuzberg)
|
5
5
|
[](https://goldziher.github.io/kreuzberg/)
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
7
|
+
[](https://github.com/Goldziher/kreuzberg)
|
7
8
|
|
8
|
-
**
|
9
|
+
**Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
|
9
10
|
|
10
11
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
11
12
|
|
12
|
-
## Why Kreuzberg?
|
13
|
+
## Why Choose Kreuzberg?
|
13
14
|
|
14
|
-
|
15
|
-
|
16
|
-
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
15
|
+
### ⚡ Proven Performance
|
16
|
+
|
17
|
+
[Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
|
18
|
+
|
19
|
+
### 🏗️ Production Engineering
|
20
|
+
|
21
|
+
Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
|
22
|
+
|
23
|
+
### 🔧 Developer Experience
|
24
|
+
|
25
|
+
Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
|
26
|
+
|
27
|
+
### 🚀 Flexible Deployment
|
28
|
+
|
29
|
+
Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
|
30
|
+
|
31
|
+
### 📄 Comprehensive Format Support
|
32
|
+
|
33
|
+
Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
|
23
34
|
|
24
35
|
## Quick Start
|
25
36
|
|
@@ -55,7 +66,7 @@ import asyncio
|
|
55
66
|
from kreuzberg import extract_file
|
56
67
|
|
57
68
|
async def main():
|
58
|
-
# Extract from
|
69
|
+
# Extract content from files
|
59
70
|
result = await extract_file("document.pdf")
|
60
71
|
print(result.content)
|
61
72
|
print(result.metadata)
|
@@ -124,7 +135,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
124
135
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
125
136
|
```
|
126
137
|
|
127
|
-
Available variants: `latest`, `
|
138
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
128
139
|
|
129
140
|
### 🌐 REST API
|
130
141
|
|
@@ -167,23 +178,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
167
178
|
| **Web** | HTML, XML, MHTML |
|
168
179
|
| **Archives** | Support via extraction |
|
169
180
|
|
170
|
-
## Performance
|
181
|
+
## 📊 Performance Comparison
|
171
182
|
|
172
|
-
|
183
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
|
173
184
|
|
174
|
-
|
|
175
|
-
| ------------- |
|
176
|
-
| **Kreuzberg** |
|
177
|
-
| Unstructured |
|
178
|
-
| MarkItDown |
|
179
|
-
| Docling |
|
185
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
186
|
+
| ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
|
187
|
+
| **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
|
188
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
189
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
190
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
180
191
|
|
181
|
-
\*
|
182
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
183
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
192
|
+
\*_Performance varies significantly with document complexity and size_
|
184
193
|
|
185
|
-
|
186
|
-
|
194
|
+
**Key strengths:**
|
195
|
+
|
196
|
+
- 6-126x faster processing than comparable frameworks
|
197
|
+
- Smallest installation footprint and memory usage
|
198
|
+
- Only framework with built-in async/await support
|
199
|
+
- Supports both CPU and GPU processing
|
200
|
+
- Built by software engineers for production reliability
|
201
|
+
|
202
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
187
203
|
|
188
204
|
## Documentation
|
189
205
|
|
@@ -191,34 +207,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
191
207
|
|
192
208
|
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
193
209
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
210
|
+
- [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
|
194
211
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
195
212
|
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
196
213
|
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
197
214
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
198
215
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
199
216
|
|
200
|
-
## Advanced Features
|
201
|
-
|
202
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
203
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
204
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
205
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
206
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
207
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
208
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
209
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
210
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
211
|
-
|
212
217
|
## License
|
213
218
|
|
214
219
|
MIT License - see [LICENSE](LICENSE) for details.
|
215
|
-
|
216
|
-
______________________________________________________________________
|
217
|
-
|
218
|
-
<div align="center">
|
219
|
-
|
220
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
221
|
-
|
222
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
223
|
-
|
224
|
-
</div>
|
@@ -3,9 +3,12 @@ name = "kreuzberg-benchmarks"
|
|
3
3
|
version = "0.1.0"
|
4
4
|
description = "Performance benchmarking suite for Kreuzberg text extraction library"
|
5
5
|
readme = "README.md"
|
6
|
-
requires-python = ">=3.
|
6
|
+
requires-python = ">=3.10"
|
7
7
|
classifiers = [
|
8
8
|
"Programming Language :: Python :: 3 :: Only",
|
9
|
+
"Programming Language :: Python :: 3.10",
|
10
|
+
"Programming Language :: Python :: 3.11",
|
11
|
+
"Programming Language :: Python :: 3.12",
|
9
12
|
"Programming Language :: Python :: 3.13",
|
10
13
|
]
|
11
14
|
dependencies = [
|