kreuzberg 3.3.0__tar.gz → 3.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.4.0/.docker/Dockerfile +21 -0
- kreuzberg-3.4.0/.docker/README.md +87 -0
- kreuzberg-3.4.0/.dockerignore +15 -0
- kreuzberg-3.4.0/.github/workflows/publish-docker.yml +101 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.gitignore +9 -8
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/PKG-INFO +63 -8
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/README.md +59 -7
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/performance.md +55 -10
- kreuzberg-3.4.0/docs/user-guide/api-server.md +169 -0
- kreuzberg-3.4.0/docs/user-guide/docker.md +249 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/index.md +2 -0
- kreuzberg-3.4.0/kreuzberg/_api/main.py +87 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_types.py +4 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/mkdocs.yaml +2 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/pyproject.toml +18 -16
- kreuzberg-3.4.0/tests/api/main_test.py +252 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/cli_integration_test.py +2 -2
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/gmft_test.py +18 -14
- kreuzberg-3.4.0/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/tesseract_test.py +27 -25
- kreuzberg-3.4.0/tests/utils/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/uv.lock +441 -32
- kreuzberg-3.3.0/run_benchmarks.py +0 -195
- kreuzberg-3.3.0/scripts/__init__.py +0 -1
- kreuzberg-3.3.0/scripts/compare_benchmarks.py +0 -100
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.commitlintrc +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/benchmarks/README.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/LICENSE +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/changelog.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/cli.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/contributing.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.3.0/kreuzberg/_extractors → kreuzberg-3.4.0/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.3.0/kreuzberg/_utils → kreuzberg-3.4.0/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.3.0/tests → kreuzberg-3.4.0/kreuzberg/_utils}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.3.0/tests/extractors → kreuzberg-3.4.0/tests}/__init__.py +0 -0
- {kreuzberg-3.3.0/tests/ocr → kreuzberg-3.4.0/tests/api}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/conftest.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extraction_test.py +0 -0
- {kreuzberg-3.3.0/tests/utils → kreuzberg-3.4.0/tests/extractors}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/types_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.0}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm as app
|
2
|
+
ARG EXTRAS=""
|
3
|
+
WORKDIR /app
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE 1
|
5
|
+
ENV PYTHONUNBUFFERED 1
|
6
|
+
ENV UV_LINK_MODE=copy
|
7
|
+
|
8
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
9
|
+
pandoc \
|
10
|
+
tesseract-ocr \
|
11
|
+
&& apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
12
|
+
|
13
|
+
|
14
|
+
COPY pyproject.toml uv.lock README.md ./
|
15
|
+
COPY kreuzberg kreuzberg
|
16
|
+
|
17
|
+
RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode
|
18
|
+
|
19
|
+
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
|
+
USER appuser
|
21
|
+
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Kreuzberg Docker Images
|
2
|
+
|
3
|
+
[GitHub](https://github.com/Goldziher/kreuzberg)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
6
|
+
[License: MIT](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
|
7
|
+
|
8
|
+
High-performance Python library for text extraction from documents, available as Docker images.
|
9
|
+
|
10
|
+
**Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
|
11
|
+
|
12
|
+
## Quick Start
|
13
|
+
|
14
|
+
```bash
|
15
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
16
|
+
```
|
17
|
+
|
18
|
+
## Available Tags
|
19
|
+
|
20
|
+
- `latest` - Latest stable release with API server and Tesseract OCR
|
21
|
+
- `X.Y.Z` - Specific version (e.g., `3.0.0`)
|
22
|
+
- `X.Y.Z-easyocr` - With EasyOCR support
|
23
|
+
- `X.Y.Z-paddle` - With PaddleOCR support
|
24
|
+
- `X.Y.Z-gmft` - With GMFT table extraction
|
25
|
+
- `X.Y.Z-all` - With all optional dependencies
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Extract Files via API
|
30
|
+
|
31
|
+
```bash
|
32
|
+
# Single file
|
33
|
+
curl -X POST http://localhost:8000/extract \
|
34
|
+
-F "data=@document.pdf"
|
35
|
+
|
36
|
+
# Multiple files
|
37
|
+
curl -X POST http://localhost:8000/extract \
|
38
|
+
-F "data=@document1.pdf" \
|
39
|
+
-F "data=@document2.docx"
|
40
|
+
```
|
41
|
+
|
42
|
+
### Docker Compose
|
43
|
+
|
44
|
+
```yaml
|
45
|
+
version: '3.8'
|
46
|
+
|
47
|
+
services:
|
48
|
+
kreuzberg:
|
49
|
+
image: goldziher/kreuzberg:latest
|
50
|
+
ports:
|
51
|
+
- "8000:8000"
|
52
|
+
restart: unless-stopped
|
53
|
+
```
|
54
|
+
|
55
|
+
## Features
|
56
|
+
|
57
|
+
- **🚀 High Performance**: Optimized for speed and efficiency
|
58
|
+
- **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
|
59
|
+
- **🔍 OCR Support**: Built-in Tesseract, optional EasyOCR/PaddleOCR
|
60
|
+
- **📊 Table Extraction**: Extract tables with GMFT
|
61
|
+
- **🔒 Secure**: Runs as non-root user, no external API calls
|
62
|
+
- **📦 Ready to Use**: Pre-configured API server
|
63
|
+
|
64
|
+
## Documentation
|
65
|
+
|
66
|
+
- **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
|
67
|
+
- **[Full Documentation](https://goldziher.github.io/kreuzberg/)** - Complete user guide and API reference
|
68
|
+
- **[API Documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/)** - REST API endpoints and usage
|
69
|
+
- **[Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/)** - Detailed Docker usage guide
|
70
|
+
|
71
|
+
## Support
|
72
|
+
|
73
|
+
- **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
|
74
|
+
- **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
|
75
|
+
- **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
|
80
|
+
|
81
|
+
## License
|
82
|
+
|
83
|
+
MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
|
84
|
+
|
85
|
+
______________________________________________________________________
|
86
|
+
|
87
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# .github/workflows/publish-docker.yml
|
2
|
+
|
3
|
+
name: Publish Docker Images
|
4
|
+
|
5
|
+
on:
|
6
|
+
workflow_run:
|
7
|
+
workflows: ["Release"]
|
8
|
+
types:
|
9
|
+
- completed
|
10
|
+
branches:
|
11
|
+
- main
|
12
|
+
|
13
|
+
jobs:
|
14
|
+
build-and-push:
|
15
|
+
runs-on: ubuntu-latest
|
16
|
+
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
17
|
+
permissions:
|
18
|
+
contents: read
|
19
|
+
packages: write
|
20
|
+
|
21
|
+
strategy:
|
22
|
+
matrix:
|
23
|
+
include:
|
24
|
+
- name: core
|
25
|
+
extras: ""
|
26
|
+
tag_suffix: "" # The base image tag (includes API + tesseract)
|
27
|
+
- name: easyocr
|
28
|
+
extras: "easyocr"
|
29
|
+
tag_suffix: "-easyocr"
|
30
|
+
- name: paddle
|
31
|
+
extras: "paddleocr"
|
32
|
+
tag_suffix: "-paddle"
|
33
|
+
- name: gmft
|
34
|
+
extras: "gmft"
|
35
|
+
tag_suffix: "-gmft"
|
36
|
+
- name: all
|
37
|
+
extras: "all"
|
38
|
+
tag_suffix: "-all"
|
39
|
+
|
40
|
+
steps:
|
41
|
+
- name: Checkout repository
|
42
|
+
uses: actions/checkout@v4
|
43
|
+
with:
|
44
|
+
ref: ${{ github.event.workflow_run.head_branch }}
|
45
|
+
|
46
|
+
- name: Get release version
|
47
|
+
id: get_version
|
48
|
+
run: |
|
49
|
+
echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
|
50
|
+
# If triggered by a tag, extract version
|
51
|
+
if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
52
|
+
echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
|
53
|
+
else
|
54
|
+
# Get the latest tag
|
55
|
+
git fetch --tags
|
56
|
+
echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
|
57
|
+
fi
|
58
|
+
|
59
|
+
- name: Set up QEMU
|
60
|
+
uses: docker/setup-qemu-action@v3
|
61
|
+
|
62
|
+
- name: Set up Docker Buildx
|
63
|
+
uses: docker/setup-buildx-action@v3
|
64
|
+
|
65
|
+
- name: Log in to Docker Hub
|
66
|
+
uses: docker/login-action@v3
|
67
|
+
with:
|
68
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
69
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
70
|
+
|
71
|
+
- name: Extract metadata (tags, labels) for Docker
|
72
|
+
id: meta
|
73
|
+
uses: docker/metadata-action@v5
|
74
|
+
with:
|
75
|
+
images: goldziher/kreuzberg
|
76
|
+
tags: |
|
77
|
+
# Release version tag (e.g., v3.0.0-easyocr)
|
78
|
+
type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
|
79
|
+
# Latest tag for each variant (e.g., latest-easyocr)
|
80
|
+
type=raw,value=latest${{ matrix.tag_suffix }}
|
81
|
+
|
82
|
+
- name: Build and push Docker image
|
83
|
+
uses: docker/build-push-action@v5
|
84
|
+
with:
|
85
|
+
context: .
|
86
|
+
file: ./.docker/Dockerfile
|
87
|
+
platforms: linux/amd64,linux/arm64
|
88
|
+
push: true
|
89
|
+
build-args: |
|
90
|
+
EXTRAS=${{ matrix.extras }}
|
91
|
+
tags: ${{ steps.meta.outputs.tags }}
|
92
|
+
labels: ${{ steps.meta.outputs.labels }}
|
93
|
+
|
94
|
+
- name: Update Docker Hub README
|
95
|
+
uses: peter-evans/dockerhub-description@v4
|
96
|
+
if: matrix.name == 'core'
|
97
|
+
with:
|
98
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
99
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
100
|
+
repository: goldziher/kreuzberg
|
101
|
+
readme-filepath: ./.docker/README.md
|
@@ -6,28 +6,29 @@
|
|
6
6
|
*.py[cod]
|
7
7
|
*.suo
|
8
8
|
*.user
|
9
|
-
|
9
|
+
*temp/
|
10
10
|
.coverage
|
11
11
|
.coverage*
|
12
|
+
.cursorrules
|
12
13
|
.dist/
|
14
|
+
.DS_store
|
13
15
|
.env
|
14
16
|
.idea/
|
17
|
+
.kreuzberg/
|
15
18
|
.mypy_cache/
|
16
19
|
.pytest_cache/
|
17
20
|
.python-version
|
21
|
+
.ropeproject
|
18
22
|
.ruff_cache/
|
19
23
|
.run/
|
20
24
|
.venv/
|
21
25
|
.vscode/
|
22
26
|
.windsurfrules
|
23
|
-
.cursorrules
|
24
|
-
CLAUDE.md
|
25
|
-
GEMINI.md
|
26
27
|
__pycache__/
|
28
|
+
benchmark_results.json
|
29
|
+
CLAUDE.md
|
27
30
|
coverage.xml
|
31
|
+
docker-compose.yaml
|
32
|
+
GEMINI.md
|
28
33
|
prompt_template.egg-info/
|
29
34
|
requirements.txt
|
30
|
-
Dockerfile
|
31
|
-
docker-compose.yaml
|
32
|
-
benchmark_results.json
|
33
|
-
.kreuzberg/
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.3.0
|
3
|
+
Version: 3.4.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -34,12 +34,15 @@ Provides-Extra: all
|
|
34
34
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
35
35
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
36
36
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
37
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
|
37
38
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
38
39
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
39
40
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
40
41
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
41
42
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
42
43
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
44
|
+
Provides-Extra: api
|
45
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
|
43
46
|
Provides-Extra: chunking
|
44
47
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
45
48
|
Provides-Extra: cli
|
@@ -63,10 +66,14 @@ Description-Content-Type: text/markdown
|
|
63
66
|
[Documentation](https://goldziher.github.io/kreuzberg/)
|
64
67
|
[License: MIT](https://opensource.org/licenses/MIT)
|
65
68
|
|
66
|
-
Kreuzberg is a Python library for text extraction from documents.
|
69
|
+
Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
|
67
70
|
|
68
71
|
## Why Kreuzberg?
|
69
72
|
|
73
|
+
- **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
|
74
|
+
- **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
|
75
|
+
- **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
|
76
|
+
- **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
|
70
77
|
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
71
78
|
- **Local Processing**: No external API calls or cloud dependencies required
|
72
79
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
@@ -85,6 +92,9 @@ pip install kreuzberg
|
|
85
92
|
|
86
93
|
# Or install with CLI support
|
87
94
|
pip install "kreuzberg[cli]"
|
95
|
+
|
96
|
+
# Or install with API server
|
97
|
+
pip install "kreuzberg[api]"
|
88
98
|
```
|
89
99
|
|
90
100
|
Install pandoc:
|
@@ -134,6 +144,31 @@ async def main():
|
|
134
144
|
asyncio.run(main())
|
135
145
|
```
|
136
146
|
|
147
|
+
## Docker
|
148
|
+
|
149
|
+
Docker images are available for easy deployment:
|
150
|
+
|
151
|
+
```bash
|
152
|
+
# Run the API server
|
153
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
154
|
+
|
155
|
+
# Extract files via API
|
156
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
157
|
+
```
|
158
|
+
|
159
|
+
See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
|
160
|
+
|
161
|
+
## REST API
|
162
|
+
|
163
|
+
Run Kreuzberg as a REST API server:
|
164
|
+
|
165
|
+
```bash
|
166
|
+
pip install "kreuzberg[api]"
|
167
|
+
litestar --app kreuzberg._api.main:app run
|
168
|
+
```
|
169
|
+
|
170
|
+
See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
|
171
|
+
|
137
172
|
## Command Line Interface
|
138
173
|
|
139
174
|
Kreuzberg includes a powerful CLI for processing documents from the command line:
|
@@ -208,7 +243,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
|
|
208
243
|
|
209
244
|
## Performance
|
210
245
|
|
211
|
-
Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
|
246
|
+
Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
|
247
|
+
|
248
|
+
### 🏆 Competitive Benchmarks
|
249
|
+
|
250
|
+
[Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
|
251
|
+
|
252
|
+
- **Fastest Extraction**: Consistently fastest processing times across file formats
|
253
|
+
- **Lowest Memory Usage**: Most memory-efficient text extraction solution
|
254
|
+
- **100% Success Rate**: Reliable extraction across all tested document types
|
255
|
+
- **Optimal for High-Throughput**: Designed for real-time, production applications
|
256
|
+
|
257
|
+
### 💾 Installation Size Efficiency
|
258
|
+
|
259
|
+
Kreuzberg delivers maximum performance with minimal overhead:
|
260
|
+
|
261
|
+
1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
|
262
|
+
1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
|
263
|
+
1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
|
264
|
+
1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
|
265
|
+
|
266
|
+
**Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
|
267
|
+
|
268
|
+
### ⚡ Sync vs Async Performance
|
269
|
+
|
270
|
+
Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
|
212
271
|
|
213
272
|
| Operation | Sync Time | Async Time | Async Advantage |
|
214
273
|
| ---------------------- | --------- | ---------- | ------------------ |
|
@@ -218,11 +277,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
|
|
218
277
|
| OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
|
219
278
|
| Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
|
220
279
|
|
221
|
-
**Rule of thumb:**
|
222
|
-
|
223
|
-
- Use **sync** for simple documents and CLI applications
|
224
|
-
- Use **async** for complex PDFs, OCR, and batch processing
|
225
|
-
- Use **batch operations** for multiple files
|
280
|
+
**Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
|
226
281
|
|
227
282
|
For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
|
228
283
|
|
@@ -5,10 +5,14 @@
|
|
5
5
|
[Documentation](https://goldziher.github.io/kreuzberg/)
|
6
6
|
[License: MIT](https://opensource.org/licenses/MIT)
|
7
7
|
|
8
|
-
Kreuzberg is a Python library for text extraction from documents.
|
8
|
+
Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
|
9
9
|
|
10
10
|
## Why Kreuzberg?
|
11
11
|
|
12
|
+
- **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
|
13
|
+
- **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
|
14
|
+
- **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
|
15
|
+
- **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
|
12
16
|
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
13
17
|
- **Local Processing**: No external API calls or cloud dependencies required
|
14
18
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
@@ -27,6 +31,9 @@ pip install kreuzberg
|
|
27
31
|
|
28
32
|
# Or install with CLI support
|
29
33
|
pip install "kreuzberg[cli]"
|
34
|
+
|
35
|
+
# Or install with API server
|
36
|
+
pip install "kreuzberg[api]"
|
30
37
|
```
|
31
38
|
|
32
39
|
Install pandoc:
|
@@ -76,6 +83,31 @@ async def main():
|
|
76
83
|
asyncio.run(main())
|
77
84
|
```
|
78
85
|
|
86
|
+
## Docker
|
87
|
+
|
88
|
+
Docker images are available for easy deployment:
|
89
|
+
|
90
|
+
```bash
|
91
|
+
# Run the API server
|
92
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
93
|
+
|
94
|
+
# Extract files via API
|
95
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
96
|
+
```
|
97
|
+
|
98
|
+
See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
|
99
|
+
|
100
|
+
## REST API
|
101
|
+
|
102
|
+
Run Kreuzberg as a REST API server:
|
103
|
+
|
104
|
+
```bash
|
105
|
+
pip install "kreuzberg[api]"
|
106
|
+
litestar --app kreuzberg._api.main:app run
|
107
|
+
```
|
108
|
+
|
109
|
+
See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
|
110
|
+
|
79
111
|
## Command Line Interface
|
80
112
|
|
81
113
|
Kreuzberg includes a powerful CLI for processing documents from the command line:
|
@@ -150,7 +182,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
|
|
150
182
|
|
151
183
|
## Performance
|
152
184
|
|
153
|
-
Kreuzberg
|
185
|
+
Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
|
186
|
+
|
187
|
+
### 🏆 Competitive Benchmarks
|
188
|
+
|
189
|
+
[Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
|
190
|
+
|
191
|
+
- **Fastest Extraction**: Consistently fastest processing times across file formats
|
192
|
+
- **Lowest Memory Usage**: Most memory-efficient text extraction solution
|
193
|
+
- **100% Success Rate**: Reliable extraction across all tested document types
|
194
|
+
- **Optimal for High-Throughput**: Designed for real-time, production applications
|
195
|
+
|
196
|
+
### 💾 Installation Size Efficiency
|
197
|
+
|
198
|
+
Kreuzberg delivers maximum performance with minimal overhead:
|
199
|
+
|
200
|
+
1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
|
201
|
+
1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
|
202
|
+
1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
|
203
|
+
1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
|
204
|
+
|
205
|
+
**Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
|
206
|
+
|
207
|
+
### ⚡ Sync vs Async Performance
|
208
|
+
|
209
|
+
Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
|
154
210
|
|
155
211
|
| Operation | Sync Time | Async Time | Async Advantage |
|
156
212
|
| ---------------------- | --------- | ---------- | ------------------ |
|
@@ -160,11 +216,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
|
|
160
216
|
| OCR processing | 0.4s | 0.7s | **❌ 1.7x slower** |
|
161
217
|
| Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
|
162
218
|
|
163
|
-
**Rule of thumb:**
|
164
|
-
|
165
|
-
- Use **sync** for simple documents and CLI applications
|
166
|
-
- Use **async** for complex PDFs, OCR, and batch processing
|
167
|
-
- Use **batch operations** for multiple files
|
219
|
+
**Rule of thumb:** Use sync for simple documents and CLI tools; use async for complex documents, OCR, batch processing, and backend APIs.
|
168
220
|
|
169
221
|
For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
|
170
222
|
|
@@ -4,18 +4,29 @@ Kreuzberg provides both synchronous and asynchronous APIs, each optimized for di
|
|
4
4
|
|
5
5
|
## Quick Reference
|
6
6
|
|
7
|
-
| Use Case | Recommended API | Reason
|
8
|
-
| ------------------- | ---------------------------- |
|
9
|
-
| CLI tools | `extract_file_sync()` | Lower overhead, simpler code
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
7
|
+
| Use Case | Recommended API | Reason |
|
8
|
+
| ------------------- | ---------------------------- | -------------------------------------- |
|
9
|
+
| CLI tools | `extract_file_sync()` | Lower overhead, simpler code |
|
10
|
+
| **Backend APIs** | `await extract_file()` | **Always use async in async contexts** |
|
11
|
+
| Web applications | `await extract_file()` | Better concurrency |
|
12
|
+
| Simple documents | `extract_file_sync()` | Faster for small files |
|
13
|
+
| Complex PDFs | `await extract_file()` | Parallelized processing |
|
14
|
+
| Batch processing | `await batch_extract_file()` | Concurrent execution |
|
15
|
+
| OCR-heavy workloads | `await extract_file()` | Multiprocessing benefits |
|
15
16
|
|
16
|
-
##
|
17
|
+
## Competitive Performance
|
17
18
|
|
18
|
-
|
19
|
+
[Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries demonstrate:
|
20
|
+
|
21
|
+
- **Fastest Extraction**: Consistently fastest processing times across file formats
|
22
|
+
- **Lowest Memory Usage**: Most memory-efficient text extraction solution
|
23
|
+
- **Smallest Installation**: 71.0 MB vs competitors ranging from 145.8 MB to 1,031.9 MB
|
24
|
+
- **100% Success Rate**: Reliable extraction across all tested document types
|
25
|
+
- **Optimal for High-Throughput**: Designed for real-time, production applications
|
26
|
+
|
27
|
+
## Internal Benchmark Results
|
28
|
+
|
29
|
+
All internal benchmarks were conducted on macOS 15.5 with ARM64 (14 cores, 48GB RAM) using Python 3.13.3.
|
19
30
|
|
20
31
|
### Single Document Processing
|
21
32
|
|
@@ -50,6 +61,29 @@ All benchmarks were conducted on macOS 15.5 with ARM64 (14 cores, 48GB RAM) usin
|
|
50
61
|
1. **Simpler Path**: Direct execution without thread/process coordination
|
51
62
|
1. **Fast Startup**: Immediate execution for quick operations
|
52
63
|
|
64
|
+
### Backend API Considerations
|
65
|
+
|
66
|
+
**Important**: When working in an async context (such as FastAPI, Django async views, or aiohttp), **always use the async API**, even for simple documents:
|
67
|
+
|
68
|
+
```python
|
69
|
+
# ✅ Correct: Use async in async contexts
|
70
|
+
async def extract_endpoint(file_path: str):
|
71
|
+
    result = await extract_file(file_path)  # Non-blocking
|
72
|
+
    return result
|
73
|
+
|
74
|
+
# ❌ Wrong: Sync in async context blocks the event loop
|
75
|
+
async def extract_endpoint(file_path: str):
|
76
|
+
    result = extract_file_sync(file_path)  # Blocks event loop!
|
77
|
+
    return result
|
78
|
+
```
|
79
|
+
|
80
|
+
**Why this matters:**
|
81
|
+
|
82
|
+
- Sync operations in async contexts block the entire event loop
|
83
|
+
- This prevents other requests from being processed concurrently
|
84
|
+
- Backend throughput drops dramatically
|
85
|
+
- Use async consistently throughout your async application stack
|
86
|
+
|
53
87
|
### The Crossover Point
|
54
88
|
|
55
89
|
The performance crossover occurs around **10KB file size** or when **OCR is required**:
|
@@ -219,6 +253,17 @@ Choose your API based on your specific needs:
|
|
219
253
|
|
220
254
|
- **Sync for simplicity**: CLI tools, simple documents, single-threaded applications
|
221
255
|
- **Async for scale**: Web applications, batch processing, complex documents
|
256
|
+
- **Async for backends**: **Always use async in async contexts** (FastAPI, Django async, etc.)
|
222
257
|
- **Batch for efficiency**: Multiple files, concurrent processing requirements
|
223
258
|
|
259
|
+
### Key Decision Points
|
260
|
+
|
261
|
+
1. **Are you in an async context?** → Use async API
|
262
|
+
1. **Processing multiple files?** → Use batch operations
|
263
|
+
1. **Simple single document in sync context?** → Sync may be faster
|
264
|
+
1. **Complex documents or OCR required?** → Use async API
|
265
|
+
1. **Building a web API?** → Use async API
|
266
|
+
|
224
267
|
The performance characteristics will vary based on your specific documents, hardware, and usage patterns. We recommend benchmarking with your actual data to make informed decisions.
|
268
|
+
|
269
|
+
**Remember**: Kreuzberg is benchmarked as one of the fastest text extraction libraries available; choosing the API that matches your context (sync or async) ensures you see that performance in practice.
|