kreuzberg 3.3.0__tar.gz → 3.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.4.1/.docker/Dockerfile +21 -0
- kreuzberg-3.4.1/.docker/README.md +87 -0
- kreuzberg-3.4.1/.dockerignore +15 -0
- kreuzberg-3.4.1/.github/workflows/docs.yml +66 -0
- kreuzberg-3.4.1/.github/workflows/publish-docker.yml +111 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.gitignore +10 -8
- kreuzberg-3.4.1/PKG-INFO +233 -0
- kreuzberg-3.4.1/README.md +168 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/ai-rulez.yaml +56 -3
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/advanced/performance.md +55 -10
- kreuzberg-3.4.1/docs/user-guide/api-server.md +169 -0
- kreuzberg-3.4.1/docs/user-guide/docker.md +245 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/index.md +2 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/__init__.py +3 -1
- kreuzberg-3.4.1/kreuzberg/_api/main.py +87 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_types.py +4 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/mkdocs.yaml +2 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/pyproject.toml +24 -18
- kreuzberg-3.4.1/tests/api/main_test.py +252 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/cli_integration_test.py +2 -2
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/gmft_test.py +18 -14
- kreuzberg-3.4.1/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/tesseract_test.py +27 -25
- kreuzberg-3.4.1/tests/utils/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/uv.lock +441 -41
- kreuzberg-3.3.0/.github/benchmarks/README.md +0 -15
- kreuzberg-3.3.0/PKG-INFO +0 -235
- kreuzberg-3.3.0/README.md +0 -177
- kreuzberg-3.3.0/run_benchmarks.py +0 -195
- kreuzberg-3.3.0/scripts/__init__.py +0 -1
- kreuzberg-3.3.0/scripts/compare_benchmarks.py +0 -100
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.commitlintrc +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/LICENSE +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/changelog.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/cli.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/contributing.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/index.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.3.0/kreuzberg/_extractors → kreuzberg-3.4.1/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.3.0/kreuzberg/_utils → kreuzberg-3.4.1/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.3.0/tests → kreuzberg-3.4.1/kreuzberg/_utils}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.3.0/tests/extractors → kreuzberg-3.4.1/tests}/__init__.py +0 -0
- {kreuzberg-3.3.0/tests/ocr → kreuzberg-3.4.1/tests/api}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/conftest.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.3.0/tests/utils → kreuzberg-3.4.1/tests/extractors}/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/types_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.3.0 → kreuzberg-3.4.1}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm as app
|
2
|
+
ARG EXTRAS=""
|
3
|
+
WORKDIR /app
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE 1
|
5
|
+
ENV PYTHONUNBUFFERED 1
|
6
|
+
ENV UV_LINK_MODE=copy
|
7
|
+
|
8
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
9
|
+
pandoc \
|
10
|
+
tesseract-ocr \
|
11
|
+
&& apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
12
|
+
|
13
|
+
|
14
|
+
COPY pyproject.toml uv.lock README.md ./
|
15
|
+
COPY kreuzberg kreuzberg
|
16
|
+
|
17
|
+
RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode
|
18
|
+
|
19
|
+
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
|
+
USER appuser
|
21
|
+
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Kreuzberg Docker Images
|
2
|
+
|
3
|
+
[GitHub Repository](https://github.com/Goldziher/kreuzberg)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
6
|
+
[License: MIT](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
|
7
|
+
|
8
|
+
High-performance Python library for text extraction from documents, available as Docker images.
|
9
|
+
|
10
|
+
**Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
|
11
|
+
|
12
|
+
## Quick Start
|
13
|
+
|
14
|
+
```bash
|
15
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
16
|
+
```
|
17
|
+
|
18
|
+
## Available Tags
|
19
|
+
|
20
|
+
- `latest` - Latest stable release with API server and Tesseract OCR
|
21
|
+
- `X.Y.Z` - Specific version (e.g., `3.0.0`)
|
22
|
+
- `X.Y.Z-easyocr` - With EasyOCR support
|
23
|
+
- `X.Y.Z-paddle` - With PaddleOCR support
|
24
|
+
- `X.Y.Z-gmft` - With GMFT table extraction
|
25
|
+
- `X.Y.Z-all` - With all optional dependencies
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Extract Files via API
|
30
|
+
|
31
|
+
```bash
|
32
|
+
# Single file
|
33
|
+
curl -X POST http://localhost:8000/extract \
|
34
|
+
-F "data=@document.pdf"
|
35
|
+
|
36
|
+
# Multiple files
|
37
|
+
curl -X POST http://localhost:8000/extract \
|
38
|
+
-F "data=@document1.pdf" \
|
39
|
+
-F "data=@document2.docx"
|
40
|
+
```
|
41
|
+
|
42
|
+
### Docker Compose
|
43
|
+
|
44
|
+
```yaml
|
45
|
+
version: '3.8'
|
46
|
+
|
47
|
+
services:
|
48
|
+
kreuzberg:
|
49
|
+
image: goldziher/kreuzberg:latest
|
50
|
+
ports:
|
51
|
+
- "8000:8000"
|
52
|
+
restart: unless-stopped
|
53
|
+
```
|
54
|
+
|
55
|
+
## Features
|
56
|
+
|
57
|
+
- **🚀 High Performance**: Optimized for speed and efficiency
|
58
|
+
- **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
|
59
|
+
- **🔍 OCR Support**: Built-in Tesseract, optional EasyOCR/PaddleOCR
|
60
|
+
- **📊 Table Extraction**: Extract tables with GMFT
|
61
|
+
- **🔒 Secure**: Runs as non-root user, no external API calls
|
62
|
+
- **📦 Ready to Use**: Pre-configured API server
|
63
|
+
|
64
|
+
## Documentation
|
65
|
+
|
66
|
+
- **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
|
67
|
+
- **[Full Documentation](https://goldziher.github.io/kreuzberg/)** - Complete user guide and API reference
|
68
|
+
- **[API Documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/)** - REST API endpoints and usage
|
69
|
+
- **[Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/)** - Detailed Docker usage guide
|
70
|
+
|
71
|
+
## Support
|
72
|
+
|
73
|
+
- **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
|
74
|
+
- **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
|
75
|
+
- **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
|
80
|
+
|
81
|
+
## License
|
82
|
+
|
83
|
+
MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
|
84
|
+
|
85
|
+
______________________________________________________________________
|
86
|
+
|
87
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
@@ -0,0 +1,66 @@
|
|
1
|
+
name: Deploy Documentation
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
paths:
|
8
|
+
- 'docs/**'
|
9
|
+
- 'mkdocs.yaml'
|
10
|
+
- '.github/workflows/docs.yml'
|
11
|
+
workflow_dispatch:
|
12
|
+
|
13
|
+
permissions:
|
14
|
+
contents: read
|
15
|
+
pages: write
|
16
|
+
id-token: write
|
17
|
+
|
18
|
+
concurrency:
|
19
|
+
group: "pages"
|
20
|
+
cancel-in-progress: false
|
21
|
+
|
22
|
+
jobs:
|
23
|
+
build:
|
24
|
+
runs-on: ubuntu-latest
|
25
|
+
steps:
|
26
|
+
- name: Checkout repository
|
27
|
+
uses: actions/checkout@v4
|
28
|
+
with:
|
29
|
+
fetch-depth: 0
|
30
|
+
|
31
|
+
- name: Setup Python
|
32
|
+
uses: actions/setup-python@v5
|
33
|
+
with:
|
34
|
+
python-version: '3.11'
|
35
|
+
|
36
|
+
- name: Install uv
|
37
|
+
uses: astral-sh/setup-uv@v6
|
38
|
+
with:
|
39
|
+
enable-cache: true
|
40
|
+
|
41
|
+
- name: Install dependencies
|
42
|
+
run: |
|
43
|
+
uv sync --group doc
|
44
|
+
|
45
|
+
- name: Setup Pages
|
46
|
+
uses: actions/configure-pages@v5
|
47
|
+
|
48
|
+
- name: Build documentation
|
49
|
+
run: |
|
50
|
+
uv run mkdocs build --clean --strict
|
51
|
+
|
52
|
+
- name: Upload artifact
|
53
|
+
uses: actions/upload-pages-artifact@v3
|
54
|
+
with:
|
55
|
+
path: ./site
|
56
|
+
|
57
|
+
deploy:
|
58
|
+
environment:
|
59
|
+
name: github-pages
|
60
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
61
|
+
runs-on: ubuntu-latest
|
62
|
+
needs: build
|
63
|
+
steps:
|
64
|
+
- name: Deploy to GitHub Pages
|
65
|
+
id: deployment
|
66
|
+
uses: actions/deploy-pages@v4
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# .github/workflows/publish-docker.yml
|
2
|
+
|
3
|
+
name: Publish Docker Images
|
4
|
+
|
5
|
+
on:
|
6
|
+
workflow_run:
|
7
|
+
workflows: ["Release"]
|
8
|
+
types:
|
9
|
+
- completed
|
10
|
+
branches:
|
11
|
+
- main
|
12
|
+
workflow_dispatch:
|
13
|
+
|
14
|
+
jobs:
|
15
|
+
build-and-push:
|
16
|
+
runs-on: ubuntu-latest
|
17
|
+
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
|
18
|
+
permissions:
|
19
|
+
contents: read
|
20
|
+
packages: write
|
21
|
+
|
22
|
+
strategy:
|
23
|
+
matrix:
|
24
|
+
include:
|
25
|
+
- name: core
|
26
|
+
extras: ""
|
27
|
+
tag_suffix: "" # The base image tag (includes API + tesseract)
|
28
|
+
- name: easyocr
|
29
|
+
extras: "easyocr"
|
30
|
+
tag_suffix: "-easyocr"
|
31
|
+
- name: paddle
|
32
|
+
extras: "paddleocr"
|
33
|
+
tag_suffix: "-paddle"
|
34
|
+
- name: gmft
|
35
|
+
extras: "gmft"
|
36
|
+
tag_suffix: "-gmft"
|
37
|
+
- name: all
|
38
|
+
extras: "all"
|
39
|
+
tag_suffix: "-all"
|
40
|
+
|
41
|
+
steps:
|
42
|
+
- name: Checkout repository
|
43
|
+
uses: actions/checkout@v4
|
44
|
+
with:
|
45
|
+
ref: ${{ github.event.workflow_run.head_branch || github.ref }}
|
46
|
+
|
47
|
+
- name: Get release version
|
48
|
+
id: get_version
|
49
|
+
run: |
|
50
|
+
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
51
|
+
# For manual dispatch, get the latest tag by listing all tags
|
52
|
+
git fetch --tags
|
53
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
54
|
+
else
|
55
|
+
# For workflow_run, use the head branch
|
56
|
+
VERSION="${{ github.event.workflow_run.head_branch }}"
|
57
|
+
# If triggered by a tag, extract version
|
58
|
+
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
59
|
+
VERSION="$VERSION"
|
60
|
+
else
|
61
|
+
# Get the latest tag by listing all tags
|
62
|
+
git fetch --tags
|
63
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
64
|
+
fi
|
65
|
+
fi
|
66
|
+
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
67
|
+
|
68
|
+
- name: Set up QEMU
|
69
|
+
uses: docker/setup-qemu-action@v3
|
70
|
+
|
71
|
+
- name: Set up Docker Buildx
|
72
|
+
uses: docker/setup-buildx-action@v3
|
73
|
+
|
74
|
+
- name: Log in to Docker Hub
|
75
|
+
uses: docker/login-action@v3
|
76
|
+
with:
|
77
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
78
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
79
|
+
|
80
|
+
- name: Extract metadata (tags, labels) for Docker
|
81
|
+
id: meta
|
82
|
+
uses: docker/metadata-action@v5
|
83
|
+
with:
|
84
|
+
images: goldziher/kreuzberg
|
85
|
+
tags: |
|
86
|
+
# Release version tag (e.g., v3.0.0-easyocr)
|
87
|
+
type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
|
88
|
+
# Latest tag for each variant (e.g., latest-easyocr)
|
89
|
+
type=raw,value=latest${{ matrix.tag_suffix }}
|
90
|
+
|
91
|
+
- name: Build and push Docker image
|
92
|
+
uses: docker/build-push-action@v5
|
93
|
+
with:
|
94
|
+
context: .
|
95
|
+
file: ./.docker/Dockerfile
|
96
|
+
platforms: linux/amd64,linux/arm64
|
97
|
+
push: true
|
98
|
+
build-args: |
|
99
|
+
EXTRAS=${{ matrix.extras }}
|
100
|
+
tags: ${{ steps.meta.outputs.tags }}
|
101
|
+
labels: ${{ steps.meta.outputs.labels }}
|
102
|
+
|
103
|
+
- name: Update Docker Hub README
|
104
|
+
uses: peter-evans/dockerhub-description@v4
|
105
|
+
if: matrix.name == 'core'
|
106
|
+
continue-on-error: true
|
107
|
+
with:
|
108
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
109
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
110
|
+
repository: goldziher/kreuzberg
|
111
|
+
readme-filepath: ./.docker/README.md
|
@@ -6,28 +6,30 @@
|
|
6
6
|
*.py[cod]
|
7
7
|
*.suo
|
8
8
|
*.user
|
9
|
-
|
9
|
+
*temp/
|
10
10
|
.coverage
|
11
11
|
.coverage*
|
12
|
+
.cursorrules
|
12
13
|
.dist/
|
14
|
+
.DS_store
|
13
15
|
.env
|
14
16
|
.idea/
|
17
|
+
.kreuzberg/
|
15
18
|
.mypy_cache/
|
16
19
|
.pytest_cache/
|
17
20
|
.python-version
|
21
|
+
.ropeproject
|
18
22
|
.ruff_cache/
|
19
23
|
.run/
|
20
24
|
.venv/
|
21
25
|
.vscode/
|
22
26
|
.windsurfrules
|
23
|
-
.cursorrules
|
24
|
-
CLAUDE.md
|
25
|
-
GEMINI.md
|
26
27
|
__pycache__/
|
28
|
+
benchmark_results.json
|
29
|
+
CLAUDE.md
|
27
30
|
coverage.xml
|
31
|
+
docker-compose.yaml
|
32
|
+
GEMINI.md
|
28
33
|
prompt_template.egg-info/
|
29
34
|
requirements.txt
|
30
|
-
|
31
|
-
docker-compose.yaml
|
32
|
-
benchmark_results.json
|
33
|
-
.kreuzberg/
|
35
|
+
site/
|
kreuzberg-3.4.1/PKG-INFO
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.4.1
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
|
+
License: MIT
|
8
|
+
License-File: LICENSE
|
9
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
11
|
+
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Operating System :: OS Independent
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
23
|
+
Classifier: Topic :: Utilities
|
24
|
+
Classifier: Typing :: Typed
|
25
|
+
Requires-Python: >=3.9
|
26
|
+
Requires-Dist: anyio>=4.9.0
|
27
|
+
Requires-Dist: charset-normalizer>=3.4.2
|
28
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
29
|
+
Requires-Dist: html-to-markdown>=1.4.0
|
30
|
+
Requires-Dist: msgspec>=0.18.0
|
31
|
+
Requires-Dist: playa-pdf>=0.6.1
|
32
|
+
Requires-Dist: psutil>=7.0.0
|
33
|
+
Requires-Dist: pypdfium2==4.30.0
|
34
|
+
Requires-Dist: python-calamine>=0.3.2
|
35
|
+
Requires-Dist: python-pptx>=1.0.2
|
36
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: all
|
38
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
40
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
41
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
|
42
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
43
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
44
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
45
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
46
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
47
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
48
|
+
Provides-Extra: api
|
49
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
|
50
|
+
Provides-Extra: chunking
|
51
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
52
|
+
Provides-Extra: cli
|
53
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
54
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
55
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
56
|
+
Provides-Extra: easyocr
|
57
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
58
|
+
Provides-Extra: gmft
|
59
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
60
|
+
Provides-Extra: paddleocr
|
61
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
62
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
63
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
64
|
+
Description-Content-Type: text/markdown
|
65
|
+
|
66
|
+
# Kreuzberg
|
67
|
+
|
68
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
69
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
70
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
71
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
72
|
+
|
73
|
+
**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
|
74
|
+
|
75
|
+
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
76
|
+
|
77
|
+
## Why Kreuzberg?
|
78
|
+
|
79
|
+
- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
|
80
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
81
|
+
- **⚡ Dual APIs**: Only library with both sync and async support
|
82
|
+
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
83
|
+
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
84
|
+
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
85
|
+
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
86
|
+
- **🐳 Production Ready**: CLI, REST API, and Docker images included
|
87
|
+
|
88
|
+
## Quick Start
|
89
|
+
|
90
|
+
### Installation
|
91
|
+
|
92
|
+
```bash
|
93
|
+
# Basic installation
|
94
|
+
pip install kreuzberg
|
95
|
+
|
96
|
+
# With optional features
|
97
|
+
pip install "kreuzberg[cli,api]" # CLI + REST API
|
98
|
+
pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
|
99
|
+
pip install "kreuzberg[all]" # Everything
|
100
|
+
```
|
101
|
+
|
102
|
+
### System Dependencies
|
103
|
+
|
104
|
+
```bash
|
105
|
+
# Ubuntu/Debian
|
106
|
+
sudo apt-get install tesseract-ocr pandoc
|
107
|
+
|
108
|
+
# macOS
|
109
|
+
brew install tesseract pandoc
|
110
|
+
|
111
|
+
# Windows
|
112
|
+
choco install tesseract pandoc
|
113
|
+
```
|
114
|
+
|
115
|
+
### Basic Usage
|
116
|
+
|
117
|
+
```python
|
118
|
+
import asyncio
|
119
|
+
from kreuzberg import extract_file
|
120
|
+
|
121
|
+
async def main():
|
122
|
+
# Extract from any document type
|
123
|
+
result = await extract_file("document.pdf")
|
124
|
+
print(result.content)
|
125
|
+
print(result.metadata)
|
126
|
+
|
127
|
+
asyncio.run(main())
|
128
|
+
```
|
129
|
+
|
130
|
+
## Deployment Options
|
131
|
+
|
132
|
+
### 🐳 Docker (Recommended)
|
133
|
+
|
134
|
+
```bash
|
135
|
+
# Run API server
|
136
|
+
docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
|
137
|
+
|
138
|
+
# Extract files
|
139
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
140
|
+
```
|
141
|
+
|
142
|
+
Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
|
143
|
+
|
144
|
+
### 🌐 REST API
|
145
|
+
|
146
|
+
```bash
|
147
|
+
# Install and run
|
148
|
+
pip install "kreuzberg[api]"
|
149
|
+
litestar --app kreuzberg._api.main:app run
|
150
|
+
|
151
|
+
# Health check
|
152
|
+
curl http://localhost:8000/health
|
153
|
+
|
154
|
+
# Extract files
|
155
|
+
curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
|
156
|
+
```
|
157
|
+
|
158
|
+
### 💻 Command Line
|
159
|
+
|
160
|
+
```bash
|
161
|
+
# Install CLI
|
162
|
+
pip install "kreuzberg[cli]"
|
163
|
+
|
164
|
+
# Extract to stdout
|
165
|
+
kreuzberg extract document.pdf
|
166
|
+
|
167
|
+
# JSON output with metadata
|
168
|
+
kreuzberg extract document.pdf --output-format json --show-metadata
|
169
|
+
|
170
|
+
# Batch processing
|
171
|
+
kreuzberg extract *.pdf --output-dir ./extracted/
|
172
|
+
```
|
173
|
+
|
174
|
+
## Supported Formats
|
175
|
+
|
176
|
+
| Category | Formats |
|
177
|
+
| ----------------- | ------------------------------ |
|
178
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
179
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
180
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
181
|
+
| **Presentations** | PPTX, PPT, ODP |
|
182
|
+
| **Web** | HTML, XML, MHTML |
|
183
|
+
| **Archives** | Support via extraction |
|
184
|
+
|
185
|
+
## Performance
|
186
|
+
|
187
|
+
**Fastest extraction speeds** with minimal resource usage:
|
188
|
+
|
189
|
+
| Library | Speed | Memory | Size | Success Rate |
|
190
|
+
| ------------- | -------------- | ------------- | ----------- | ------------ |
|
191
|
+
| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
|
192
|
+
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
193
|
+
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
194
|
+
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
195
|
+
|
196
|
+
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
197
|
+
|
198
|
+
## Documentation
|
199
|
+
|
200
|
+
### Quick Links
|
201
|
+
|
202
|
+
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
203
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
204
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
205
|
+
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
206
|
+
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
207
|
+
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
208
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
209
|
+
|
210
|
+
## Advanced Features
|
211
|
+
|
212
|
+
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
213
|
+
- **🧩 Content Chunking**: Split documents for RAG applications
|
214
|
+
- **🎯 Custom Extractors**: Extend with your own document handlers
|
215
|
+
- **🔧 Configuration**: Flexible TOML-based configuration
|
216
|
+
- **🪝 Hooks**: Pre/post-processing customization
|
217
|
+
- **🌍 Multi-language OCR**: 100+ languages supported
|
218
|
+
- **⚙️ Metadata Extraction**: Rich document metadata
|
219
|
+
- **🔄 Batch Processing**: Efficient bulk document processing
|
220
|
+
|
221
|
+
## License
|
222
|
+
|
223
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
224
|
+
|
225
|
+
______________________________________________________________________
|
226
|
+
|
227
|
+
<div align="center">
|
228
|
+
|
229
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
230
|
+
|
231
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
232
|
+
|
233
|
+
</div>
|