kreuzberg 3.9.1__tar.gz → 3.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/PKG-INFO +4 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/ai-rulez.yaml +11 -1
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_pdf.py +76 -5
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_playa.py +6 -4
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_types.py +2 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/pyproject.toml +3 -2
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pdf_test.py +48 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/uv.lock +64 -7
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.commitlintrc +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.deepsource.toml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.docker/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.dockerignore +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.gitignore +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/LICENSE +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/changelog.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/cli.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/contributing.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/api/main_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/config_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/conftest.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extraction_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/email_comprehensive_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/gmft_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/types_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.9.1 → kreuzberg-3.10.0}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.9.1
|
3
|
+
Version: 3.10.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
53
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
54
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
55
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
56
57
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
57
58
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
59
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
@@ -69,6 +70,8 @@ Provides-Extra: cli
|
|
69
70
|
Requires-Dist: click>=8.2.1; extra == 'cli'
|
70
71
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
71
72
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
|
+
Provides-Extra: crypto
|
74
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
72
75
|
Provides-Extra: easyocr
|
73
76
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
74
77
|
Provides-Extra: entity-extraction
|
@@ -193,16 +193,18 @@ rules:
|
|
193
193
|
api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
|
194
194
|
cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
|
195
195
|
chunking = ["semantic-text-splitter>=0.27.0"]
|
196
|
+
crypto = ["playa-pdf[crypto]>=0.6.4"]
|
196
197
|
easyocr = ["easyocr>=1.7.2"]
|
197
198
|
gmft = ["gmft>=0.4.2"]
|
198
199
|
langdetect = ["fast-langdetect>=0.2.0"]
|
199
200
|
paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
|
200
|
-
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
|
201
|
+
all = ["kreuzberg[api,chunking,cli,crypto,easyocr,gmft,langdetect,paddleocr]"]
|
201
202
|
```
|
202
203
|
|
203
204
|
### Installation Patterns
|
204
205
|
- Basic: `pip install kreuzberg`
|
205
206
|
- With features: `pip install "kreuzberg[api,cli]"`
|
207
|
+
- With crypto support: `pip install "kreuzberg[crypto]"`
|
206
208
|
- All features: `pip install "kreuzberg[all]"`
|
207
209
|
- Development: `uv sync --all-extras`
|
208
210
|
|
@@ -211,6 +213,14 @@ rules:
|
|
211
213
|
- **System**: tesseract-ocr, pandoc (via package manager)
|
212
214
|
- **Development**: Uses dependency groups in pyproject.toml
|
213
215
|
|
216
|
+
### Crypto Support
|
217
|
+
The `crypto` extra adds cryptographic support for PDF processing:
|
218
|
+
- **Purpose**: Enables AES encryption/decryption for password-protected PDFs
|
219
|
+
- **Dependencies**: Adds cryptography (~22MB), cffi, and pycparser
|
220
|
+
- **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
|
221
|
+
- **Password Support**: Supports single password or list of passwords to try in sequence
|
222
|
+
- **Size Impact**: Increases installation size by ~24MB due to cryptography package
|
223
|
+
|
214
224
|
sections:
|
215
225
|
- title: "Language Detection"
|
216
226
|
content: |
|
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
|
|
22
22
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
23
23
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
24
24
|
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
25
|
-
from kreuzberg._types import ExtractionResult, OcrBackendType
|
25
|
+
from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
|
26
26
|
from kreuzberg._utils._errors import create_error_context, should_retry
|
27
27
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
28
28
|
from kreuzberg._utils._string import normalize_spaces
|
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
|
|
33
33
|
|
34
34
|
if TYPE_CHECKING: # pragma: no cover
|
35
35
|
from PIL.Image import Image
|
36
|
+
from playa.document import Document
|
36
37
|
|
37
38
|
|
38
39
|
class PDFExtractor(Extractor):
|
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
|
|
45
46
|
file_path, unlink = await create_temp_file(".pdf")
|
46
47
|
await AsyncPath(file_path).write_bytes(content)
|
47
48
|
try:
|
48
|
-
metadata = await extract_pdf_metadata(content)
|
49
|
+
metadata = await self._extract_metadata_with_password_attempts(content)
|
49
50
|
result = await self.extract_path_async(file_path)
|
50
51
|
|
51
52
|
result.metadata = metadata
|
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
|
|
73
74
|
if not result:
|
74
75
|
result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
75
76
|
|
76
|
-
result.metadata = await extract_pdf_metadata(content_bytes)
|
77
|
+
result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
|
77
78
|
|
78
79
|
if self.config.extract_tables:
|
79
80
|
# GMFT is optional dependency
|
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
|
|
107
108
|
|
108
109
|
result = self.extract_path_sync(Path(temp_path))
|
109
110
|
|
110
|
-
metadata = extract_pdf_metadata_sync(content)
|
111
|
+
metadata = self._extract_metadata_with_password_attempts_sync(content)
|
111
112
|
result.metadata = metadata
|
112
113
|
|
113
114
|
return result
|
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
|
|
406
407
|
# Use list comprehension and join for efficient string building
|
407
408
|
return "\n\n".join(result.content for result in results)
|
408
409
|
|
410
|
+
def _parse_with_password_attempts(self, content: bytes) -> Document:
|
411
|
+
"""Parse PDF with password attempts."""
|
412
|
+
# Normalize password to list
|
413
|
+
if isinstance(self.config.pdf_password, str):
|
414
|
+
passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
|
415
|
+
else:
|
416
|
+
passwords = list(self.config.pdf_password)
|
417
|
+
|
418
|
+
# Try each password in sequence
|
419
|
+
last_exception = None
|
420
|
+
for password in passwords:
|
421
|
+
try:
|
422
|
+
return parse(content, max_workers=1, password=password)
|
423
|
+
except Exception as e: # noqa: PERF203, BLE001
|
424
|
+
last_exception = e
|
425
|
+
continue
|
426
|
+
|
427
|
+
# If all passwords failed, raise the last exception
|
428
|
+
if last_exception:
|
429
|
+
raise last_exception from None
|
430
|
+
|
431
|
+
# Fallback to no password
|
432
|
+
return parse(content, max_workers=1, password="")
|
433
|
+
|
434
|
+
def _get_passwords_to_try(self) -> list[str]:
|
435
|
+
"""Get list of passwords to try in sequence."""
|
436
|
+
if isinstance(self.config.pdf_password, str):
|
437
|
+
return [self.config.pdf_password] if self.config.pdf_password else [""]
|
438
|
+
return list(self.config.pdf_password) if self.config.pdf_password else [""]
|
439
|
+
|
440
|
+
async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
|
441
|
+
"""Extract PDF metadata with password attempts."""
|
442
|
+
passwords = self._get_passwords_to_try()
|
443
|
+
|
444
|
+
last_exception = None
|
445
|
+
for password in passwords:
|
446
|
+
try:
|
447
|
+
return await extract_pdf_metadata(content, password=password)
|
448
|
+
except Exception as e: # noqa: PERF203, BLE001
|
449
|
+
last_exception = e
|
450
|
+
continue
|
451
|
+
|
452
|
+
# If all passwords failed, try with empty password as fallback
|
453
|
+
try:
|
454
|
+
return await extract_pdf_metadata(content, password="")
|
455
|
+
except Exception:
|
456
|
+
if last_exception:
|
457
|
+
raise last_exception from None
|
458
|
+
raise
|
459
|
+
|
460
|
+
def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
|
461
|
+
"""Extract PDF metadata with password attempts (sync version)."""
|
462
|
+
passwords = self._get_passwords_to_try()
|
463
|
+
|
464
|
+
last_exception = None
|
465
|
+
for password in passwords:
|
466
|
+
try:
|
467
|
+
return extract_pdf_metadata_sync(content, password=password)
|
468
|
+
except Exception as e: # noqa: PERF203, BLE001
|
469
|
+
last_exception = e
|
470
|
+
continue
|
471
|
+
|
472
|
+
# If all passwords failed, try with empty password as fallback
|
473
|
+
try:
|
474
|
+
return extract_pdf_metadata_sync(content, password="")
|
475
|
+
except Exception:
|
476
|
+
if last_exception:
|
477
|
+
raise last_exception from None
|
478
|
+
raise
|
479
|
+
|
409
480
|
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
410
481
|
"""Extract text using playa for better structure preservation."""
|
411
482
|
with contextlib.suppress(Exception):
|
412
483
|
content = path.read_bytes()
|
413
|
-
document = parse(content, max_workers=1)
|
484
|
+
document = self._parse_with_password_attempts(content)
|
414
485
|
|
415
486
|
# Extract text while preserving structure
|
416
487
|
pages_text = []
|
@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
|
|
24
24
|
BOM_CHAR = "\ufeff"
|
25
25
|
|
26
26
|
|
27
|
-
async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
|
27
|
+
async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
|
28
28
|
"""Extract metadata from a PDF document.
|
29
29
|
|
30
30
|
Args:
|
31
31
|
pdf_content: The bytes of the PDF document.
|
32
|
+
password: Password for encrypted PDF files.
|
32
33
|
|
33
34
|
Raises:
|
34
35
|
ParsingError: If the PDF metadata could not be extracted.
|
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
|
|
37
38
|
A dictionary of metadata extracted from the PDF.
|
38
39
|
"""
|
39
40
|
try:
|
40
|
-
document = parse(pdf_content, max_workers=1)
|
41
|
+
document = parse(pdf_content, max_workers=1, password=password)
|
41
42
|
metadata: Metadata = {}
|
42
43
|
|
43
44
|
for raw_info in document.info:
|
@@ -275,13 +276,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
275
276
|
result["subtitle"] = subtitle
|
276
277
|
|
277
278
|
|
278
|
-
def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
|
279
|
+
def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
|
279
280
|
"""Synchronous version of extract_pdf_metadata.
|
280
281
|
|
281
282
|
Extract metadata from a PDF document without using async/await.
|
282
283
|
|
283
284
|
Args:
|
284
285
|
pdf_content: The bytes of the PDF document.
|
286
|
+
password: Password for encrypted PDF files.
|
285
287
|
|
286
288
|
Raises:
|
287
289
|
ParsingError: If the PDF metadata could not be extracted.
|
@@ -290,7 +292,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
|
|
290
292
|
A dictionary of metadata extracted from the PDF.
|
291
293
|
"""
|
292
294
|
try:
|
293
|
-
document = parse(pdf_content, max_workers=1)
|
295
|
+
document = parse(pdf_content, max_workers=1, password=password)
|
294
296
|
metadata: Metadata = {}
|
295
297
|
|
296
298
|
for raw_info in document.info:
|
@@ -357,6 +357,8 @@ class ExtractionConfig:
|
|
357
357
|
"""The mode to use for document classification."""
|
358
358
|
enable_quality_processing: bool = True
|
359
359
|
"""Whether to apply quality post-processing to improve extraction results."""
|
360
|
+
pdf_password: str | list[str] = ""
|
361
|
+
"""Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
|
360
362
|
|
361
363
|
def __post_init__(self) -> None:
|
362
364
|
if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.9.1"
|
8
|
+
version = "3.10.0"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -76,7 +76,7 @@ optional-dependencies.additional-extensions = [
|
|
76
76
|
"tomli>=2.0.0; python_version<'3.11'",
|
77
77
|
]
|
78
78
|
optional-dependencies.all = [
|
79
|
-
"kreuzberg[additional-extensions,api,chunking,cli,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
79
|
+
"kreuzberg[additional-extensions,api,chunking,cli,crypto,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
|
80
80
|
]
|
81
81
|
optional-dependencies.api = [
|
82
82
|
"litestar[standard,structlog,opentelemetry]>=2.16.0",
|
@@ -91,6 +91,7 @@ optional-dependencies.cli = [
|
|
91
91
|
"rich>=14.1.0",
|
92
92
|
"tomli>=2.0.0; python_version<'3.11'",
|
93
93
|
]
|
94
|
+
optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.6.4" ]
|
94
95
|
optional-dependencies.easyocr = [ "easyocr>=1.7.2" ]
|
95
96
|
optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
|
96
97
|
optional-dependencies.gmft = [ "gmft>=0.4.2" ]
|
@@ -388,3 +388,51 @@ async def test_extract_pdf_searchable_text_page_errors(
|
|
388
388
|
return MockDocument()
|
389
389
|
|
390
390
|
monkeypatch.setattr(pypdfium2, "PdfDocument", mock_pdf_document)
|
391
|
+
|
392
|
+
|
393
|
+
def test_pdf_password_configuration() -> None:
|
394
|
+
"""Test PDF password configuration variations."""
|
395
|
+
# Test single password string
|
396
|
+
config = ExtractionConfig(pdf_password="test")
|
397
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
398
|
+
passwords = extractor._get_passwords_to_try()
|
399
|
+
assert passwords == ["test"]
|
400
|
+
|
401
|
+
# Test multiple passwords list
|
402
|
+
config = ExtractionConfig(pdf_password=["pass1", "pass2", "pass3"])
|
403
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
404
|
+
passwords = extractor._get_passwords_to_try()
|
405
|
+
assert passwords == ["pass1", "pass2", "pass3"]
|
406
|
+
|
407
|
+
# Test empty password string
|
408
|
+
config = ExtractionConfig(pdf_password="")
|
409
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
410
|
+
passwords = extractor._get_passwords_to_try()
|
411
|
+
assert passwords == [""]
|
412
|
+
|
413
|
+
# Test empty password list
|
414
|
+
config = ExtractionConfig(pdf_password=[])
|
415
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
416
|
+
passwords = extractor._get_passwords_to_try()
|
417
|
+
assert passwords == [""]
|
418
|
+
|
419
|
+
|
420
|
+
def test_pdf_password_attempts_with_parse_with_password_attempts(test_article: Path) -> None:
|
421
|
+
"""Test the _parse_with_password_attempts method with different password configurations."""
|
422
|
+
# Test with no password (should work with regular PDF)
|
423
|
+
config = ExtractionConfig(pdf_password="")
|
424
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
425
|
+
|
426
|
+
content = test_article.read_bytes()
|
427
|
+
document = extractor._parse_with_password_attempts(content)
|
428
|
+
|
429
|
+
assert document is not None
|
430
|
+
assert len(document.pages) > 0
|
431
|
+
|
432
|
+
# Test with wrong password but fallback should work
|
433
|
+
config = ExtractionConfig(pdf_password="wrongpassword")
|
434
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
435
|
+
|
436
|
+
document = extractor._parse_with_password_attempts(content)
|
437
|
+
assert document is not None
|
438
|
+
assert len(document.pages) > 0
|
@@ -876,6 +876,53 @@ toml = [
|
|
876
876
|
{ name = "tomli", marker = "python_full_version <= '3.11'" },
|
877
877
|
]
|
878
878
|
|
879
|
+
[[package]]
|
880
|
+
name = "cryptography"
|
881
|
+
version = "45.0.5"
|
882
|
+
source = { registry = "https://pypi.org/simple" }
|
883
|
+
dependencies = [
|
884
|
+
{ name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
|
885
|
+
]
|
886
|
+
sdist = { url = "https://files.pythonhosted.org/packages/95/1e/49527ac611af559665f71cbb8f92b332b5ec9c6fbc4e88b0f8e92f5e85df/cryptography-45.0.5.tar.gz", hash = "sha256:72e76caa004ab63accdf26023fccd1d087f6d90ec6048ff33ad0445abf7f605a", size = 744903, upload-time = "2025-07-02T13:06:25.941Z" }
|
887
|
+
wheels = [
|
888
|
+
{ url = "https://files.pythonhosted.org/packages/f0/fb/09e28bc0c46d2c547085e60897fea96310574c70fb21cd58a730a45f3403/cryptography-45.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:101ee65078f6dd3e5a028d4f19c07ffa4dd22cce6a20eaa160f8b5219911e7d8", size = 7043092, upload-time = "2025-07-02T13:05:01.514Z" },
|
889
|
+
{ url = "https://files.pythonhosted.org/packages/b1/05/2194432935e29b91fb649f6149c1a4f9e6d3d9fc880919f4ad1bcc22641e/cryptography-45.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3a264aae5f7fbb089dbc01e0242d3b67dffe3e6292e1f5182122bdf58e65215d", size = 4205926, upload-time = "2025-07-02T13:05:04.741Z" },
|
890
|
+
{ url = "https://files.pythonhosted.org/packages/07/8b/9ef5da82350175e32de245646b1884fc01124f53eb31164c77f95a08d682/cryptography-45.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e74d30ec9c7cb2f404af331d5b4099a9b322a8a6b25c4632755c8757345baac5", size = 4429235, upload-time = "2025-07-02T13:05:07.084Z" },
|
891
|
+
{ url = "https://files.pythonhosted.org/packages/7c/e1/c809f398adde1994ee53438912192d92a1d0fc0f2d7582659d9ef4c28b0c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3af26738f2db354aafe492fb3869e955b12b2ef2e16908c8b9cb928128d42c57", size = 4209785, upload-time = "2025-07-02T13:05:09.321Z" },
|
892
|
+
{ url = "https://files.pythonhosted.org/packages/d0/8b/07eb6bd5acff58406c5e806eff34a124936f41a4fb52909ffa4d00815f8c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e6c00130ed423201c5bc5544c23359141660b07999ad82e34e7bb8f882bb78e0", size = 3893050, upload-time = "2025-07-02T13:05:11.069Z" },
|
893
|
+
{ url = "https://files.pythonhosted.org/packages/ec/ef/3333295ed58d900a13c92806b67e62f27876845a9a908c939f040887cca9/cryptography-45.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:dd420e577921c8c2d31289536c386aaa30140b473835e97f83bc71ea9d2baf2d", size = 4457379, upload-time = "2025-07-02T13:05:13.32Z" },
|
894
|
+
{ url = "https://files.pythonhosted.org/packages/d9/9d/44080674dee514dbb82b21d6fa5d1055368f208304e2ab1828d85c9de8f4/cryptography-45.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:d05a38884db2ba215218745f0781775806bde4f32e07b135348355fe8e4991d9", size = 4209355, upload-time = "2025-07-02T13:05:15.017Z" },
|
895
|
+
{ url = "https://files.pythonhosted.org/packages/c9/d8/0749f7d39f53f8258e5c18a93131919ac465ee1f9dccaf1b3f420235e0b5/cryptography-45.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:ad0caded895a00261a5b4aa9af828baede54638754b51955a0ac75576b831b27", size = 4456087, upload-time = "2025-07-02T13:05:16.945Z" },
|
896
|
+
{ url = "https://files.pythonhosted.org/packages/09/d7/92acac187387bf08902b0bf0699816f08553927bdd6ba3654da0010289b4/cryptography-45.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9024beb59aca9d31d36fcdc1604dd9bbeed0a55bface9f1908df19178e2f116e", size = 4332873, upload-time = "2025-07-02T13:05:18.743Z" },
|
897
|
+
{ url = "https://files.pythonhosted.org/packages/03/c2/840e0710da5106a7c3d4153c7215b2736151bba60bf4491bdb421df5056d/cryptography-45.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:91098f02ca81579c85f66df8a588c78f331ca19089763d733e34ad359f474174", size = 4564651, upload-time = "2025-07-02T13:05:21.382Z" },
|
898
|
+
{ url = "https://files.pythonhosted.org/packages/2e/92/cc723dd6d71e9747a887b94eb3827825c6c24b9e6ce2bb33b847d31d5eaa/cryptography-45.0.5-cp311-abi3-win32.whl", hash = "sha256:926c3ea71a6043921050eaa639137e13dbe7b4ab25800932a8498364fc1abec9", size = 2929050, upload-time = "2025-07-02T13:05:23.39Z" },
|
899
|
+
{ url = "https://files.pythonhosted.org/packages/1f/10/197da38a5911a48dd5389c043de4aec4b3c94cb836299b01253940788d78/cryptography-45.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:b85980d1e345fe769cfc57c57db2b59cff5464ee0c045d52c0df087e926fbe63", size = 3403224, upload-time = "2025-07-02T13:05:25.202Z" },
|
900
|
+
{ url = "https://files.pythonhosted.org/packages/fe/2b/160ce8c2765e7a481ce57d55eba1546148583e7b6f85514472b1d151711d/cryptography-45.0.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f3562c2f23c612f2e4a6964a61d942f891d29ee320edb62ff48ffb99f3de9ae8", size = 7017143, upload-time = "2025-07-02T13:05:27.229Z" },
|
901
|
+
{ url = "https://files.pythonhosted.org/packages/c2/e7/2187be2f871c0221a81f55ee3105d3cf3e273c0a0853651d7011eada0d7e/cryptography-45.0.5-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3fcfbefc4a7f332dece7272a88e410f611e79458fab97b5efe14e54fe476f4fd", size = 4197780, upload-time = "2025-07-02T13:05:29.299Z" },
|
902
|
+
{ url = "https://files.pythonhosted.org/packages/b9/cf/84210c447c06104e6be9122661159ad4ce7a8190011669afceeaea150524/cryptography-45.0.5-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:460f8c39ba66af7db0545a8c6f2eabcbc5a5528fc1cf6c3fa9a1e44cec33385e", size = 4420091, upload-time = "2025-07-02T13:05:31.221Z" },
|
903
|
+
{ url = "https://files.pythonhosted.org/packages/3e/6a/cb8b5c8bb82fafffa23aeff8d3a39822593cee6e2f16c5ca5c2ecca344f7/cryptography-45.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9b4cf6318915dccfe218e69bbec417fdd7c7185aa7aab139a2c0beb7468c89f0", size = 4198711, upload-time = "2025-07-02T13:05:33.062Z" },
|
904
|
+
{ url = "https://files.pythonhosted.org/packages/04/f7/36d2d69df69c94cbb2473871926daf0f01ad8e00fe3986ac3c1e8c4ca4b3/cryptography-45.0.5-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2089cc8f70a6e454601525e5bf2779e665d7865af002a5dec8d14e561002e135", size = 3883299, upload-time = "2025-07-02T13:05:34.94Z" },
|
905
|
+
{ url = "https://files.pythonhosted.org/packages/82/c7/f0ea40f016de72f81288e9fe8d1f6748036cb5ba6118774317a3ffc6022d/cryptography-45.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0027d566d65a38497bc37e0dd7c2f8ceda73597d2ac9ba93810204f56f52ebc7", size = 4450558, upload-time = "2025-07-02T13:05:37.288Z" },
|
906
|
+
{ url = "https://files.pythonhosted.org/packages/06/ae/94b504dc1a3cdf642d710407c62e86296f7da9e66f27ab12a1ee6fdf005b/cryptography-45.0.5-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:be97d3a19c16a9be00edf79dca949c8fa7eff621763666a145f9f9535a5d7f42", size = 4198020, upload-time = "2025-07-02T13:05:39.102Z" },
|
907
|
+
{ url = "https://files.pythonhosted.org/packages/05/2b/aaf0adb845d5dabb43480f18f7ca72e94f92c280aa983ddbd0bcd6ecd037/cryptography-45.0.5-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:7760c1c2e1a7084153a0f68fab76e754083b126a47d0117c9ed15e69e2103492", size = 4449759, upload-time = "2025-07-02T13:05:41.398Z" },
|
908
|
+
{ url = "https://files.pythonhosted.org/packages/91/e4/f17e02066de63e0100a3a01b56f8f1016973a1d67551beaf585157a86b3f/cryptography-45.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6ff8728d8d890b3dda5765276d1bc6fb099252915a2cd3aff960c4c195745dd0", size = 4319991, upload-time = "2025-07-02T13:05:43.64Z" },
|
909
|
+
{ url = "https://files.pythonhosted.org/packages/f2/2e/e2dbd629481b499b14516eed933f3276eb3239f7cee2dcfa4ee6b44d4711/cryptography-45.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7259038202a47fdecee7e62e0fd0b0738b6daa335354396c6ddebdbe1206af2a", size = 4554189, upload-time = "2025-07-02T13:05:46.045Z" },
|
910
|
+
{ url = "https://files.pythonhosted.org/packages/f8/ea/a78a0c38f4c8736287b71c2ea3799d173d5ce778c7d6e3c163a95a05ad2a/cryptography-45.0.5-cp37-abi3-win32.whl", hash = "sha256:1e1da5accc0c750056c556a93c3e9cb828970206c68867712ca5805e46dc806f", size = 2911769, upload-time = "2025-07-02T13:05:48.329Z" },
|
911
|
+
{ url = "https://files.pythonhosted.org/packages/79/b3/28ac139109d9005ad3f6b6f8976ffede6706a6478e21c889ce36c840918e/cryptography-45.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:90cb0a7bb35959f37e23303b7eed0a32280510030daba3f7fdfbb65defde6a97", size = 3390016, upload-time = "2025-07-02T13:05:50.811Z" },
|
912
|
+
{ url = "https://files.pythonhosted.org/packages/f8/8b/34394337abe4566848a2bd49b26bcd4b07fd466afd3e8cce4cb79a390869/cryptography-45.0.5-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:206210d03c1193f4e1ff681d22885181d47efa1ab3018766a7b32a7b3d6e6afd", size = 3575762, upload-time = "2025-07-02T13:05:53.166Z" },
|
913
|
+
{ url = "https://files.pythonhosted.org/packages/8b/5d/a19441c1e89afb0f173ac13178606ca6fab0d3bd3ebc29e9ed1318b507fc/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c648025b6840fe62e57107e0a25f604db740e728bd67da4f6f060f03017d5097", size = 4140906, upload-time = "2025-07-02T13:05:55.914Z" },
|
914
|
+
{ url = "https://files.pythonhosted.org/packages/4b/db/daceb259982a3c2da4e619f45b5bfdec0e922a23de213b2636e78ef0919b/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b8fa8b0a35a9982a3c60ec79905ba5bb090fc0b9addcfd3dc2dd04267e45f25e", size = 4374411, upload-time = "2025-07-02T13:05:57.814Z" },
|
915
|
+
{ url = "https://files.pythonhosted.org/packages/6a/35/5d06ad06402fc522c8bf7eab73422d05e789b4e38fe3206a85e3d6966c11/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:14d96584701a887763384f3c47f0ca7c1cce322aa1c31172680eb596b890ec30", size = 4140942, upload-time = "2025-07-02T13:06:00.137Z" },
|
916
|
+
{ url = "https://files.pythonhosted.org/packages/65/79/020a5413347e44c382ef1f7f7e7a66817cd6273e3e6b5a72d18177b08b2f/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:57c816dfbd1659a367831baca4b775b2a5b43c003daf52e9d57e1d30bc2e1b0e", size = 4374079, upload-time = "2025-07-02T13:06:02.043Z" },
|
917
|
+
{ url = "https://files.pythonhosted.org/packages/9b/c5/c0e07d84a9a2a8a0ed4f865e58f37c71af3eab7d5e094ff1b21f3f3af3bc/cryptography-45.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b9e38e0a83cd51e07f5a48ff9691cae95a79bea28fe4ded168a8e5c6c77e819d", size = 3321362, upload-time = "2025-07-02T13:06:04.463Z" },
|
918
|
+
{ url = "https://files.pythonhosted.org/packages/c0/71/9bdbcfd58d6ff5084687fe722c58ac718ebedbc98b9f8f93781354e6d286/cryptography-45.0.5-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8c4a6ff8a30e9e3d38ac0539e9a9e02540ab3f827a3394f8852432f6b0ea152e", size = 3587878, upload-time = "2025-07-02T13:06:06.339Z" },
|
919
|
+
{ url = "https://files.pythonhosted.org/packages/f0/63/83516cfb87f4a8756eaa4203f93b283fda23d210fc14e1e594bd5f20edb6/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bd4c45986472694e5121084c6ebbd112aa919a25e783b87eb95953c9573906d6", size = 4152447, upload-time = "2025-07-02T13:06:08.345Z" },
|
920
|
+
{ url = "https://files.pythonhosted.org/packages/22/11/d2823d2a5a0bd5802b3565437add16f5c8ce1f0778bf3822f89ad2740a38/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:982518cd64c54fcada9d7e5cf28eabd3ee76bd03ab18e08a48cad7e8b6f31b18", size = 4386778, upload-time = "2025-07-02T13:06:10.263Z" },
|
921
|
+
{ url = "https://files.pythonhosted.org/packages/5f/38/6bf177ca6bce4fe14704ab3e93627c5b0ca05242261a2e43ef3168472540/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:12e55281d993a793b0e883066f590c1ae1e802e3acb67f8b442e721e475e6463", size = 4151627, upload-time = "2025-07-02T13:06:13.097Z" },
|
922
|
+
{ url = "https://files.pythonhosted.org/packages/38/6a/69fc67e5266bff68a91bcb81dff8fb0aba4d79a78521a08812048913e16f/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:5aa1e32983d4443e310f726ee4b071ab7569f58eedfdd65e9675484a4eb67bd1", size = 4385593, upload-time = "2025-07-02T13:06:15.689Z" },
|
923
|
+
{ url = "https://files.pythonhosted.org/packages/f6/34/31a1604c9a9ade0fdab61eb48570e09a796f4d9836121266447b0eaf7feb/cryptography-45.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e357286c1b76403dd384d938f93c46b2b058ed4dfcdce64a770f0537ed3feb6f", size = 3331106, upload-time = "2025-07-02T13:06:18.058Z" },
|
924
|
+
]
|
925
|
+
|
879
926
|
[[package]]
|
880
927
|
name = "csscompressor"
|
881
928
|
version = "0.9.5"
|
@@ -1079,7 +1126,7 @@ name = "exceptiongroup"
|
|
1079
1126
|
version = "1.3.0"
|
1080
1127
|
source = { registry = "https://pypi.org/simple" }
|
1081
1128
|
dependencies = [
|
1082
|
-
{ name = "typing-extensions", marker = "python_full_version < '3.
|
1129
|
+
{ name = "typing-extensions", marker = "python_full_version < '3.11'" },
|
1083
1130
|
]
|
1084
1131
|
sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
|
1085
1132
|
wheels = [
|
@@ -1965,7 +2012,7 @@ wheels = [
|
|
1965
2012
|
|
1966
2013
|
[[package]]
|
1967
2014
|
name = "kreuzberg"
|
1968
|
-
version = "3.
|
2015
|
+
version = "3.10.0"
|
1969
2016
|
source = { editable = "." }
|
1970
2017
|
dependencies = [
|
1971
2018
|
{ name = "anyio" },
|
@@ -1997,6 +2044,7 @@ all = [
|
|
1997
2044
|
{ name = "mailparse" },
|
1998
2045
|
{ name = "paddleocr" },
|
1999
2046
|
{ name = "paddlepaddle" },
|
2047
|
+
{ name = "playa-pdf", extra = ["crypto"] },
|
2000
2048
|
{ name = "rich" },
|
2001
2049
|
{ name = "semantic-text-splitter" },
|
2002
2050
|
{ name = "setuptools" },
|
@@ -2018,6 +2066,9 @@ cli = [
|
|
2018
2066
|
{ name = "rich" },
|
2019
2067
|
{ name = "tomli", marker = "python_full_version < '3.11'" },
|
2020
2068
|
]
|
2069
|
+
crypto = [
|
2070
|
+
{ name = "playa-pdf", extra = ["crypto"] },
|
2071
|
+
]
|
2021
2072
|
easyocr = [
|
2022
2073
|
{ name = "easyocr" },
|
2023
2074
|
]
|
@@ -2070,7 +2121,7 @@ requires-dist = [
|
|
2070
2121
|
{ name = "gmft", marker = "extra == 'gmft'", specifier = ">=0.4.2" },
|
2071
2122
|
{ name = "html-to-markdown", extras = ["lxml"], specifier = ">=1.9.0" },
|
2072
2123
|
{ name = "keybert", marker = "extra == 'entity-extraction'", specifier = ">=0.9.0" },
|
2073
|
-
{ name = "kreuzberg", extras = ["additional-extensions", "api", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
|
2124
|
+
{ name = "kreuzberg", extras = ["additional-extensions", "api", "chunking", "cli", "crypto", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
|
2074
2125
|
{ name = "litestar", extras = ["standard", "structlog", "opentelemetry"], marker = "extra == 'api'", specifier = ">=2.16.0" },
|
2075
2126
|
{ name = "mailparse", marker = "extra == 'additional-extensions'", specifier = ">=1.0.15" },
|
2076
2127
|
{ name = "mcp", specifier = ">=1.12.2" },
|
@@ -2079,6 +2130,7 @@ requires-dist = [
|
|
2079
2130
|
{ name = "paddlepaddle", marker = "extra == 'paddleocr'", specifier = ">=3.1.0" },
|
2080
2131
|
{ name = "pandas", marker = "extra == 'auto-classify-document-type'", specifier = ">=2.3.1" },
|
2081
2132
|
{ name = "playa-pdf", specifier = ">=0.6.4" },
|
2133
|
+
{ name = "playa-pdf", extras = ["crypto"], marker = "extra == 'crypto'", specifier = ">=0.6.4" },
|
2082
2134
|
{ name = "psutil", specifier = ">=7.0.0" },
|
2083
2135
|
{ name = "pypdfium2", specifier = "==4.30.0" },
|
2084
2136
|
{ name = "python-calamine", specifier = ">=0.3.2" },
|
@@ -2091,7 +2143,7 @@ requires-dist = [
|
|
2091
2143
|
{ name = "tomli", marker = "python_full_version < '3.11' and extra == 'cli'", specifier = ">=2.0.0" },
|
2092
2144
|
{ name = "typing-extensions", marker = "python_full_version < '3.12'", specifier = ">=4.14.0" },
|
2093
2145
|
]
|
2094
|
-
provides-extras = ["additional-extensions", "all", "api", "auto-classify-document-type", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"]
|
2146
|
+
provides-extras = ["additional-extensions", "all", "api", "auto-classify-document-type", "chunking", "cli", "crypto", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"]
|
2095
2147
|
|
2096
2148
|
[package.metadata.requires-dev]
|
2097
2149
|
dev = [
|
@@ -3988,6 +4040,11 @@ wheels = [
|
|
3988
4040
|
{ url = "https://files.pythonhosted.org/packages/42/66/5362cccdabd6b425cbf6cf2e115560255066427d52a45c36891e63f7be97/playa_pdf-0.6.4-py3-none-any.whl", hash = "sha256:d8ff856bab8be784fd39ad83bbd7bdd09db382b1cb6923d14a0cb03e0e6841f0", size = 5661468, upload-time = "2025-07-26T16:09:33.04Z" },
|
3989
4041
|
]
|
3990
4042
|
|
4043
|
+
[package.optional-dependencies]
|
4044
|
+
crypto = [
|
4045
|
+
{ name = "cryptography" },
|
4046
|
+
]
|
4047
|
+
|
3991
4048
|
[[package]]
|
3992
4049
|
name = "pluggy"
|
3993
4050
|
version = "1.6.0"
|
@@ -6081,7 +6138,7 @@ wheels = [
|
|
6081
6138
|
|
6082
6139
|
[[package]]
|
6083
6140
|
name = "transformers"
|
6084
|
-
version = "4.54.
|
6141
|
+
version = "4.54.1"
|
6085
6142
|
source = { registry = "https://pypi.org/simple" }
|
6086
6143
|
dependencies = [
|
6087
6144
|
{ name = "filelock" },
|
@@ -6096,9 +6153,9 @@ dependencies = [
|
|
6096
6153
|
{ name = "tokenizers" },
|
6097
6154
|
{ name = "tqdm" },
|
6098
6155
|
]
|
6099
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
6156
|
+
sdist = { url = "https://files.pythonhosted.org/packages/21/6c/4caeb57926f91d943f309b062e22ad1eb24a9f530421c5a65c1d89378a7a/transformers-4.54.1.tar.gz", hash = "sha256:b2551bb97903f13bd90c9467d0a144d41ca4d142defc044a99502bb77c5c1052", size = 9514288, upload-time = "2025-07-29T15:57:22.826Z" }
|
6100
6157
|
wheels = [
|
6101
|
-
{ url = "https://files.pythonhosted.org/packages/
|
6158
|
+
{ url = "https://files.pythonhosted.org/packages/cf/18/eb7578f84ef5a080d4e5ca9bc4f7c68e7aa9c1e464f1b3d3001e4c642fce/transformers-4.54.1-py3-none-any.whl", hash = "sha256:c89965a4f62a0d07009d45927a9c6372848a02ab9ead9c318c3d082708bab529", size = 11176397, upload-time = "2025-07-29T15:57:19.692Z" },
|
6102
6159
|
]
|
6103
6160
|
|
6104
6161
|
[[package]]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{kreuzberg-3.9.1 → kreuzberg-3.10.0}/benchmarks/results/serialization_benchmark_results.json
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|