kreuzberg 3.4.2__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/publish-docker.yml +2 -3
- kreuzberg-3.5.0/.gitmodules +3 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/PKG-INFO +3 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/benchmark_baseline.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/final_benchmark.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/serialization_benchmark.py +0 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/performance.md +46 -9
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/changelog.md +2 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/examples/extraction-examples.md +41 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/installation.md +9 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/docker.md +49 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/extraction-configuration.md +53 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/ocr-configuration.md +86 -19
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/__init__.py +2 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_image.py +21 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pdf.py +44 -14
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_spread_sheet.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_gmft.py +4 -4
- kreuzberg-3.5.0/kreuzberg/_language_detection.py +95 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_easyocr.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_tesseract.py +7 -3
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_types.py +11 -4
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_device.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_process_pool.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_sync.py +1 -5
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_tmp.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/extraction.py +10 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/mkdocs.yaml +1 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/pyproject.toml +22 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/api/main_test.py +2 -5
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/cli_integration_test.py +9 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extraction_test.py +10 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/image_test.py +17 -4
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pdf_test.py +8 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/gmft_extended_test.py +6 -17
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/gmft_test.py +0 -3
- kreuzberg-3.5.0/tests/language_detection_test.py +237 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/sync_tesseract_test.py +2 -3
- kreuzberg-3.5.0/tests/test_source_files/french-text.txt +2 -0
- kreuzberg-3.5.0/tests/test_source_files/german-text.txt +2 -0
- kreuzberg-3.5.0/tests/test_source_files/spanish-text.txt +2 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/cache_test.py +0 -3
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/errors_test.py +0 -1
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/process_pool_test.py +0 -3
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/sync_test.py +0 -7
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/uv.lock +68 -2
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.commitlintrc +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.docker/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.dockerignore +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.gitignore +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/LICENSE +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/cli.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/contributing.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/conftest.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/types_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/tmp_test.py +0 -0
@@ -1,4 +1,3 @@
|
|
1
|
-
# .github/workflows/publish-docker.yml
|
2
1
|
|
3
2
|
name: Publish Docker Images
|
4
3
|
|
@@ -24,7 +23,7 @@ jobs:
|
|
24
23
|
include:
|
25
24
|
- name: core
|
26
25
|
extras: ""
|
27
|
-
tag_suffix: ""
|
26
|
+
tag_suffix: ""
|
28
27
|
- name: easyocr
|
29
28
|
extras: "easyocr"
|
30
29
|
tag_suffix: "-easyocr"
|
@@ -89,7 +88,7 @@ jobs:
|
|
89
88
|
type=raw,value=latest${{ matrix.tag_suffix }}
|
90
89
|
|
91
90
|
- name: Build and push Docker image
|
92
|
-
uses: docker/build-push-action@
|
91
|
+
uses: docker/build-push-action@v6
|
93
92
|
with:
|
94
93
|
context: .
|
95
94
|
file: ./.docker/Dockerfile
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.4.2
|
3
|
+
Version: 3.5.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -56,6 +56,8 @@ Provides-Extra: easyocr
|
|
56
56
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
57
57
|
Provides-Extra: gmft
|
58
58
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
59
|
+
Provides-Extra: langdetect
|
60
|
+
Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
|
59
61
|
Provides-Extra: paddleocr
|
60
62
|
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
61
63
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
|
|
8
8
|
from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
|
9
9
|
|
10
10
|
|
11
|
-
async def run_baseline_benchmark() -> dict[str, object] | None:
|
11
|
+
async def run_baseline_benchmark() -> dict[str, object] | None:
|
12
12
|
"""Run comprehensive baseline benchmark."""
|
13
13
|
test_files_dir = Path("tests/test_source_files")
|
14
14
|
test_files = list(test_files_dir.glob("*.pdf"))
|
@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
|
|
15
15
|
)
|
16
16
|
|
17
17
|
|
18
|
-
async def run_final_benchmark() -> dict[str, object] | None:
|
18
|
+
async def run_final_benchmark() -> dict[str, object] | None:
|
19
19
|
"""Run comprehensive benchmark of all caching improvements."""
|
20
20
|
test_files_dir = Path("tests/test_source_files")
|
21
21
|
pdf_files = list(test_files_dir.glob("*.pdf"))
|
@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
|
|
101
101
|
json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
|
102
102
|
json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
|
103
103
|
|
104
|
-
# Type casting for arithmetic operations
|
105
104
|
json_ser_mean = json_serialize["mean"]
|
106
105
|
json_deser_mean = json_deserialize["mean"]
|
107
106
|
msgpack_ser_mean = msgpack_serialize["mean"]
|
@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
|
|
125
125
|
1. **Configure OCR appropriately** for your document types
|
126
126
|
1. **Profile your specific workload** - results vary by content
|
127
127
|
|
128
|
-
### Configuration
|
128
|
+
### Optimized Default Configuration
|
129
|
+
|
130
|
+
Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
|
129
131
|
|
130
132
|
```python
|
131
|
-
from kreuzberg import ExtractionConfig
|
132
|
-
from kreuzberg._ocr import TesseractConfig
|
133
|
+
from kreuzberg import ExtractionConfig
|
133
134
|
|
134
|
-
#
|
135
|
-
|
135
|
+
# Default configuration - already optimized for modern documents
|
136
|
+
config = ExtractionConfig() # Uses optimized defaults:
|
137
|
+
# - PSM: AUTO_ONLY (fast without orientation detection)
|
138
|
+
# - Language model: Disabled for performance
|
139
|
+
# - Dictionary correction: Enabled for accuracy
|
140
|
+
```
|
136
141
|
|
137
|
-
|
138
|
-
accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1)) # Auto page segmentation
|
142
|
+
### Advanced Configuration Examples
|
139
143
|
|
140
|
-
|
141
|
-
|
144
|
+
```python
|
145
|
+
from kreuzberg import ExtractionConfig, extract_file_sync
|
146
|
+
from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
|
147
|
+
|
148
|
+
# Maximum speed configuration (for high-volume processing)
|
149
|
+
speed_config = ExtractionConfig(
|
150
|
+
ocr_backend="tesseract",
|
151
|
+
ocr_config=TesseractConfig(
|
152
|
+
psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
|
153
|
+
language_model_ngram_on=False, # Already disabled by default
|
154
|
+
tessedit_enable_dict_correction=False, # Disable for maximum speed
|
155
|
+
),
|
156
|
+
)
|
157
|
+
|
158
|
+
# Maximum accuracy configuration (for degraded documents)
|
159
|
+
accuracy_config = ExtractionConfig(
|
160
|
+
ocr_backend="tesseract",
|
161
|
+
ocr_config=TesseractConfig(
|
162
|
+
psm=PSMMode.AUTO, # Full analysis with orientation detection
|
163
|
+
language_model_ngram_on=True, # Enable for historical/degraded text
|
164
|
+
tessedit_enable_dict_correction=True, # Default - keep enabled
|
165
|
+
),
|
166
|
+
)
|
167
|
+
|
168
|
+
# No OCR configuration (text documents only)
|
169
|
+
text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
|
142
170
|
```
|
143
171
|
|
172
|
+
### Performance Optimization Tips
|
173
|
+
|
174
|
+
Based on comprehensive benchmarking with 138+ documents:
|
175
|
+
|
176
|
+
1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
|
177
|
+
1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
|
178
|
+
1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide 30x+ speedup with minimal quality impact on clean documents
|
179
|
+
1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
|
180
|
+
|
144
181
|
### Batch Processing Best Practices
|
145
182
|
|
146
183
|
```python
|
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
12
12
|
- Documentation site with comprehensive examples and API reference
|
13
13
|
- Improved configuration for all OCR backends
|
14
14
|
- Added hooks system for validation and post-processing
|
15
|
+
- Language detection feature with `auto_detect_language` configuration option
|
16
|
+
- New optional dependency group `langdetect` for automatic language detection
|
15
17
|
|
16
18
|
### Changed
|
17
19
|
|
@@ -79,6 +79,47 @@ async def extract_with_different_backends():
|
|
79
79
|
print(f"No OCR result: {result.content[:100]}...")
|
80
80
|
```
|
81
81
|
|
82
|
+
## Language Detection
|
83
|
+
|
84
|
+
```python
|
85
|
+
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
|
86
|
+
|
87
|
+
async def detect_document_language():
|
88
|
+
# Simple automatic language detection
|
89
|
+
result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
|
90
|
+
|
91
|
+
# Access detected languages
|
92
|
+
if result.detected_languages:
|
93
|
+
print(f"Detected languages: {', '.join(result.detected_languages)}")
|
94
|
+
# Example output: "Detected languages: en, de, fr"
|
95
|
+
|
96
|
+
async def detect_multilingual_document():
|
97
|
+
# Advanced multilingual detection with custom configuration
|
98
|
+
lang_config = LanguageDetectionConfig(
|
99
|
+
multilingual=True, # Detect multiple languages in mixed text
|
100
|
+
top_k=5, # Return top 5 languages
|
101
|
+
low_memory=False, # Use high accuracy mode
|
102
|
+
)
|
103
|
+
|
104
|
+
result = await extract_file(
|
105
|
+
"multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
|
106
|
+
)
|
107
|
+
|
108
|
+
if result.detected_languages:
|
109
|
+
print(f"Detected languages: {result.detected_languages}")
|
110
|
+
|
111
|
+
# Use detected languages for OCR
|
112
|
+
from kreuzberg import TesseractConfig
|
113
|
+
|
114
|
+
# Create language string for Tesseract (e.g., "eng+deu+fra")
|
115
|
+
tesseract_langs = "+".join(result.detected_languages[:3])
|
116
|
+
|
117
|
+
result_with_ocr = await extract_file(
|
118
|
+
"multilingual_document.pdf",
|
119
|
+
config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
|
120
|
+
)
|
121
|
+
```
|
122
|
+
|
82
123
|
## Table Extraction
|
83
124
|
|
84
125
|
```python
|
@@ -102,6 +102,14 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
|
|
102
102
|
pip install "kreuzberg[gmft]"
|
103
103
|
```
|
104
104
|
|
105
|
+
### Language Detection
|
106
|
+
|
107
|
+
Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
|
108
|
+
|
109
|
+
```shell
|
110
|
+
pip install "kreuzberg[langdetect]"
|
111
|
+
```
|
112
|
+
|
105
113
|
### All Optional Dependencies
|
106
114
|
|
107
115
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -113,5 +121,5 @@ pip install "kreuzberg[all]"
|
|
113
121
|
This is equivalent to:
|
114
122
|
|
115
123
|
```shell
|
116
|
-
pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
|
124
|
+
pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
|
117
125
|
```
|
@@ -124,6 +124,34 @@ Additional dependencies by variant:
|
|
124
124
|
- **gmft**: GMFT for table extraction
|
125
125
|
- **all**: All optional dependencies
|
126
126
|
|
127
|
+
### Health Check
|
128
|
+
|
129
|
+
All Docker images include a health check endpoint:
|
130
|
+
|
131
|
+
```bash
|
132
|
+
# Check API health
|
133
|
+
curl http://localhost:8000/health
|
134
|
+
```
|
135
|
+
|
136
|
+
Returns a JSON response with service status and version information.
|
137
|
+
|
138
|
+
### Observability
|
139
|
+
|
140
|
+
The Docker images include built-in OpenTelemetry instrumentation via Litestar:
|
141
|
+
|
142
|
+
- **Tracing**: Automatic request/response tracing
|
143
|
+
- **Metrics**: Performance and usage metrics
|
144
|
+
- **Logging**: Structured JSON logging
|
145
|
+
|
146
|
+
Configure via standard OpenTelemetry environment variables:
|
147
|
+
|
148
|
+
```bash
|
149
|
+
docker run -p 8000:8000 \
|
150
|
+
-e OTEL_SERVICE_NAME=kreuzberg-api \
|
151
|
+
-e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
|
152
|
+
goldziher/kreuzberg:latest
|
153
|
+
```
|
154
|
+
|
127
155
|
### Environment Variables
|
128
156
|
|
129
157
|
- `PYTHONUNBUFFERED=1` - Ensures proper logging output
|
@@ -150,6 +178,12 @@ server {
|
|
150
178
|
client_max_body_size 100M;
|
151
179
|
proxy_read_timeout 300s;
|
152
180
|
}
|
181
|
+
|
182
|
+
# Health check endpoint
|
183
|
+
location /health {
|
184
|
+
proxy_pass http://localhost:8000/health;
|
185
|
+
access_log off;
|
186
|
+
}
|
153
187
|
}
|
154
188
|
```
|
155
189
|
|
@@ -175,6 +209,21 @@ spec:
|
|
175
209
|
image: goldziher/kreuzberg:latest
|
176
210
|
ports:
|
177
211
|
- containerPort: 8000
|
212
|
+
livenessProbe:
|
213
|
+
httpGet:
|
214
|
+
path: /health
|
215
|
+
port: 8000
|
216
|
+
initialDelaySeconds: 30
|
217
|
+
periodSeconds: 10
|
218
|
+
readinessProbe:
|
219
|
+
httpGet:
|
220
|
+
path: /health
|
221
|
+
port: 8000
|
222
|
+
initialDelaySeconds: 5
|
223
|
+
periodSeconds: 5
|
224
|
+
env:
|
225
|
+
- name: OTEL_SERVICE_NAME
|
226
|
+
value: "kreuzberg-api"
|
178
227
|
resources:
|
179
228
|
requests:
|
180
229
|
memory: "512Mi"
|
@@ -9,6 +9,7 @@ All extraction functions accept an optional `config` parameter of type `Extracti
|
|
9
9
|
- Control OCR behavior with `force_ocr` and `ocr_backend`
|
10
10
|
- Provide engine-specific OCR configuration via `ocr_config`
|
11
11
|
- Enable table extraction with `extract_tables` and configure it via `gmft_config`
|
12
|
+
- Enable automatic language detection with `auto_detect_language`
|
12
13
|
- Add validation and post-processing hooks
|
13
14
|
- Configure custom extractors
|
14
15
|
|
@@ -100,6 +101,58 @@ Note that table extraction requires the `gmft` dependency. You can install it wi
|
|
100
101
|
pip install "kreuzberg[gmft]"
|
101
102
|
```
|
102
103
|
|
104
|
+
### Language Detection
|
105
|
+
|
106
|
+
Kreuzberg can automatically detect the language of extracted text using fast-langdetect:
|
107
|
+
|
108
|
+
```python
|
109
|
+
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
|
110
|
+
|
111
|
+
# Simple automatic language detection
|
112
|
+
result = await extract_file("multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True))
|
113
|
+
|
114
|
+
# Access detected languages (lowercase ISO 639-1 codes)
|
115
|
+
if result.detected_languages:
|
116
|
+
print(f"Detected languages: {', '.join(result.detected_languages)}")
|
117
|
+
# Example output: "Detected languages: en, de, fr"
|
118
|
+
|
119
|
+
# Advanced configuration with multilingual detection
|
120
|
+
lang_config = LanguageDetectionConfig(
|
121
|
+
multilingual=True, # Enable mixed-language detection
|
122
|
+
top_k=5, # Return top 5 languages
|
123
|
+
low_memory=False, # Use high accuracy mode
|
124
|
+
cache_dir="/tmp/lang_models", # Custom model cache directory
|
125
|
+
)
|
126
|
+
|
127
|
+
result = await extract_file(
|
128
|
+
"multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
|
129
|
+
)
|
130
|
+
|
131
|
+
# Use detected languages for OCR
|
132
|
+
if result.detected_languages:
|
133
|
+
# Re-extract with OCR using the primary detected language
|
134
|
+
from kreuzberg import TesseractConfig
|
135
|
+
|
136
|
+
result_with_ocr = await extract_file(
|
137
|
+
"multilingual_document.pdf",
|
138
|
+
config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=result.detected_languages[0])),
|
139
|
+
)
|
140
|
+
```
|
141
|
+
|
142
|
+
#### Language Detection Configuration Options
|
143
|
+
|
144
|
+
- `low_memory` (default: `True`): Use smaller model (~200MB) vs larger, more accurate model
|
145
|
+
- `multilingual` (default: `False`): Enable detection of multiple languages in mixed text
|
146
|
+
- `top_k` (default: `3`): Maximum number of languages to return
|
147
|
+
- `cache_dir`: Custom directory for language model storage
|
148
|
+
- `allow_fallback` (default: `True`): Fall back to small model if large model fails
|
149
|
+
|
150
|
+
The feature requires the `langdetect` dependency:
|
151
|
+
|
152
|
+
```shell
|
153
|
+
pip install "kreuzberg[langdetect]"
|
154
|
+
```
|
155
|
+
|
103
156
|
### Batch Processing
|
104
157
|
|
105
158
|
```python
|
@@ -62,15 +62,15 @@ result = await extract_file("document.pdf", config=ExtractionConfig(ocr_config=T
|
|
62
62
|
|
63
63
|
#### Available PSM Modes
|
64
64
|
|
65
|
-
| Mode
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
| Single
|
70
|
-
| Single
|
71
|
-
| Single
|
72
|
-
|
|
73
|
-
| Sparse Text
|
65
|
+
| Mode | Enum Value | Description | Best For |
|
66
|
+
| ------------- | ----------------------- | -------------------------------------------------------- | ---------------------------------------------- |
|
67
|
+
| Auto Only | `PSMMode.AUTO_ONLY` | Automatic segmentation without orientation detection | Modern documents (default - fastest) |
|
68
|
+
| Automatic | `PSMMode.AUTO` | Automatic page segmentation with orientation detection | Rotated/skewed documents |
|
69
|
+
| Single Block | `PSMMode.SINGLE_BLOCK` | Treat the image as a single text block | Simple layouts, preserving paragraph structure |
|
70
|
+
| Single Column | `PSMMode.SINGLE_COLUMN` | Assume a single column of text | Books, articles, single-column documents |
|
71
|
+
| Single Line | `PSMMode.SINGLE_LINE` | Treat the image as a single text line | Receipts, labels, single-line text |
|
72
|
+
| Single Word | `PSMMode.SINGLE_WORD` | Treat the image as a single word | Word recognition tasks |
|
73
|
+
| Sparse Text | `PSMMode.SPARSE_TEXT` | Find as much text as possible without assuming structure | Forms, tables, scattered text |
|
74
74
|
|
75
75
|
### Forcing OCR
|
76
76
|
|
@@ -139,23 +139,90 @@ result = await extract_file(
|
|
139
139
|
|
140
140
|
## Performance Optimization
|
141
141
|
|
142
|
-
|
142
|
+
### Default Configuration
|
143
143
|
|
144
|
-
|
144
|
+
Kreuzberg's defaults are optimized out-of-the-box for modern PDFs and standard documents:
|
145
145
|
|
146
|
-
-
|
147
|
-
-
|
148
|
-
-
|
146
|
+
- **PSM Mode**: `AUTO_ONLY` - Faster than `AUTO` without orientation detection overhead
|
147
|
+
- **Language Model**: Disabled by default for optimal performance on modern documents
|
148
|
+
- **Dictionary Correction**: Enabled for accuracy
|
149
|
+
|
150
|
+
The default configuration provides excellent extraction quality for:
|
151
|
+
|
152
|
+
- Modern PDFs with embedded text
|
153
|
+
- Scanned documents with clear printing
|
154
|
+
- Office documents (DOCX, PPTX, XLSX)
|
155
|
+
- Standard business documents
|
156
|
+
|
157
|
+
### Speed vs Quality Trade-offs
|
158
|
+
|
159
|
+
```python
|
160
|
+
from kreuzberg import ExtractionConfig, TesseractConfig, PSMMode
|
161
|
+
|
162
|
+
# Default configuration (optimized for modern documents)
|
163
|
+
default_config = ExtractionConfig() # Already optimized for speed and quality
|
164
|
+
|
165
|
+
# Maximum speed configuration
|
166
|
+
speed_config = ExtractionConfig(
|
167
|
+
ocr_backend="tesseract",
|
168
|
+
ocr_config=TesseractConfig(
|
169
|
+
psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
|
170
|
+
tessedit_enable_dict_correction=False, # Skip dictionary correction
|
171
|
+
),
|
172
|
+
)
|
173
|
+
|
174
|
+
# Maximum accuracy configuration (for degraded/historical documents)
|
175
|
+
accuracy_config = ExtractionConfig(
|
176
|
+
ocr_backend="tesseract",
|
177
|
+
ocr_config=TesseractConfig(
|
178
|
+
psm=PSMMode.AUTO, # Full analysis with orientation detection
|
179
|
+
language_model_ngram_on=True, # Enable for degraded/historical text
|
180
|
+
tessedit_enable_dict_correction=True, # Correct OCR errors
|
181
|
+
),
|
182
|
+
)
|
183
|
+
```
|
184
|
+
|
185
|
+
### Language Model N-gram Settings
|
186
|
+
|
187
|
+
The `language_model_ngram_on` parameter controls Tesseract's use of n-gram language models:
|
188
|
+
|
189
|
+
- **Default (False)**: Optimized for modern documents with clear text
|
190
|
+
- **When to enable**: Historical documents, degraded scans, handwritten text, or noisy images
|
191
|
+
|
192
|
+
```python
|
193
|
+
# For degraded or historical documents
|
194
|
+
historical_config = ExtractionConfig(
|
195
|
+
ocr_backend="tesseract",
|
196
|
+
ocr_config=TesseractConfig(
|
197
|
+
language_model_ngram_on=True, # Enable for better accuracy on poor quality text
|
198
|
+
),
|
199
|
+
)
|
200
|
+
```
|
201
|
+
|
202
|
+
### When to Disable OCR
|
203
|
+
|
204
|
+
For documents with text layers (searchable PDFs, Office docs), disable OCR entirely:
|
205
|
+
|
206
|
+
```python
|
207
|
+
# No OCR overhead for text documents
|
208
|
+
text_config = ExtractionConfig(ocr_backend=None)
|
209
|
+
```
|
210
|
+
|
211
|
+
This provides a significant speedup (78% of PDFs have text layers and extract in \<0.01s).
|
149
212
|
|
150
213
|
## Best Practices
|
151
214
|
|
152
215
|
- **Language Selection**: Always specify the correct language for your documents to improve OCR accuracy
|
153
216
|
- **PSM Mode Selection**: Choose the appropriate PSM mode based on your document layout:
|
154
|
-
- Use `
|
155
|
-
- Use `
|
156
|
-
- Use `
|
217
|
+
- Use `PSMMode.AUTO_ONLY` (default) for modern, well-formatted documents
|
218
|
+
- Use `PSMMode.SINGLE_BLOCK` for faster processing of simple layouts
|
219
|
+
- Use `PSMMode.SPARSE_TEXT` for forms or documents with tables
|
220
|
+
- Use `PSMMode.AUTO` only when orientation detection is needed
|
221
|
+
- **Performance Optimization**:
|
222
|
+
- Disable OCR (`ocr_backend=None`) for documents with text layers
|
223
|
+
- Keep the language model disabled for clean documents (`language_model_ngram_on=False`, the default)
|
224
|
+
- Disable dictionary correction for technical documents (`tessedit_enable_dict_correction=False`)
|
157
225
|
- **Image Quality**: For best results, ensure images are:
|
158
226
|
- High resolution (at least 300 DPI)
|
159
227
|
- Well-lit with good contrast
|
160
|
-
- Not skewed or rotated
|
161
|
-
- **Performance**: For batch processing, adjust `max_processes` based on your system's capabilities
|
228
|
+
- Not skewed or rotated (unless using `PSMMode.AUTO`)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from importlib.metadata import version
|
2
2
|
|
3
3
|
from kreuzberg._gmft import GMFTConfig
|
4
|
+
from kreuzberg._language_detection import LanguageDetectionConfig
|
4
5
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
5
6
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
6
7
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
@@ -29,6 +30,7 @@ __all__ = [
|
|
29
30
|
"ExtractorRegistry",
|
30
31
|
"GMFTConfig",
|
31
32
|
"KreuzbergError",
|
33
|
+
"LanguageDetectionConfig",
|
32
34
|
"Metadata",
|
33
35
|
"MissingDependencyError",
|
34
36
|
"OCRError",
|
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
|
|
80
80
|
if self.config.ocr_backend is None:
|
81
81
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
82
82
|
|
83
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
84
83
|
from kreuzberg._types import ExtractionResult
|
85
84
|
|
86
85
|
if self.config.ocr_backend == "tesseract":
|
87
86
|
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
87
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
88
88
|
|
89
89
|
if isinstance(self.config.ocr_config, TesseractConfig):
|
90
90
|
config = self.config.ocr_config
|
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
|
|
96
96
|
return results[0]
|
97
97
|
return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
|
98
98
|
|
99
|
+
if self.config.ocr_backend == "paddleocr":
|
100
|
+
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
101
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
102
|
+
|
103
|
+
paddle_config = (
|
104
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
105
|
+
)
|
106
|
+
|
107
|
+
return paddle_process(path, paddle_config)
|
108
|
+
|
109
|
+
if self.config.ocr_backend == "easyocr":
|
110
|
+
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
111
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
112
|
+
|
113
|
+
easy_config = (
|
114
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
115
|
+
)
|
116
|
+
|
117
|
+
return easy_process(path, easy_config)
|
118
|
+
|
99
119
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
100
120
|
|
101
121
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
|
|
299
299
|
"""Extract text from PDF using OCR (sync version)."""
|
300
300
|
pdf = None
|
301
301
|
try:
|
302
|
-
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
303
|
-
|
304
302
|
images = []
|
305
303
|
with pypdfium_file_lock(path):
|
306
304
|
pdf = pypdfium2.PdfDocument(str(path))
|
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
|
|
325
323
|
os.close(fd)
|
326
324
|
image_paths.append(temp_path)
|
327
325
|
|
328
|
-
|
329
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
330
|
-
|
331
|
-
if isinstance(self.config.ocr_config, TesseractConfig):
|
332
|
-
config = self.config.ocr_config
|
333
|
-
else:
|
334
|
-
config = TesseractConfig()
|
335
|
-
results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
|
336
|
-
text_parts = [r.content for r in results]
|
337
|
-
return "\n\n".join(text_parts)
|
338
|
-
|
339
|
-
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
326
|
+
return self._process_pdf_images_with_ocr(image_paths)
|
340
327
|
|
341
328
|
finally:
|
342
329
|
for _, temp_path in temp_files:
|
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
|
|
349
336
|
if pdf:
|
350
337
|
with pypdfium_file_lock(path), contextlib.suppress(Exception):
|
351
338
|
pdf.close()
|
339
|
+
|
340
|
+
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
341
|
+
"""Process PDF images with the configured OCR backend."""
|
342
|
+
if self.config.ocr_backend == "tesseract":
|
343
|
+
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
344
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
345
|
+
|
346
|
+
tesseract_config = (
|
347
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
348
|
+
)
|
349
|
+
results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
|
350
|
+
text_parts = [r.content for r in results]
|
351
|
+
return "\n\n".join(text_parts)
|
352
|
+
|
353
|
+
if self.config.ocr_backend == "paddleocr":
|
354
|
+
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
355
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
356
|
+
|
357
|
+
paddle_config = (
|
358
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
359
|
+
)
|
360
|
+
|
361
|
+
text_parts = []
|
362
|
+
for image_path in image_paths:
|
363
|
+
result = paddle_process(Path(image_path), paddle_config)
|
364
|
+
text_parts.append(result.content)
|
365
|
+
return "\n\n".join(text_parts)
|
366
|
+
|
367
|
+
if self.config.ocr_backend == "easyocr":
|
368
|
+
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
369
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
370
|
+
|
371
|
+
easy_config = (
|
372
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
373
|
+
)
|
374
|
+
|
375
|
+
text_parts = []
|
376
|
+
for image_path in image_paths:
|
377
|
+
result = easy_process(Path(image_path), easy_config)
|
378
|
+
text_parts.append(result.content)
|
379
|
+
return "\n\n".join(text_parts)
|
380
|
+
|
381
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
@@ -6,7 +6,7 @@ import sys
|
|
6
6
|
from datetime import date, datetime, time, timedelta
|
7
7
|
from io import StringIO
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Any
|
9
|
+
from typing import Any
|
10
10
|
|
11
11
|
from anyio import Path as AsyncPath
|
12
12
|
from python_calamine import CalamineWorkbook
|
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11): # pragma: no cover
|
|
23
23
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
24
24
|
|
25
25
|
|
26
|
-
CellValue =
|
26
|
+
CellValue = int | float | str | bool | time | date | datetime | timedelta
|
27
27
|
|
28
28
|
|
29
29
|
class SpreadSheetExtractor(Extractor):
|