kreuzberg 3.6.1__tar.gz → 3.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.docker/Dockerfile +4 -4
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/workflows/publish-docker.yml +14 -1
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.pre-commit-config.yaml +2 -2
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/PKG-INFO +72 -15
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/README.md +69 -13
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/index.md +1 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/index.md +1 -0
- kreuzberg-3.7.0/docs/user-guide/mcp-server.md +571 -0
- kreuzberg-3.7.0/kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg-3.7.0/kreuzberg/_mcp/server.py +227 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/mkdocs.yaml +1 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/pyproject.toml +4 -2
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extraction_test.py +2 -2
- kreuzberg-3.7.0/tests/mcp_server_test.py +374 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/uv.lock +228 -54
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.commitlintrc +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.docker/README.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.dockerignore +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.gitignore +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.gitmodules +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/LICENSE +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/changelog.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/cli.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/contributing.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/sync_easyocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/api/main_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/conftest.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/gmft_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/types_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.7.0}/tests/utils/tmp_test.py +0 -0
@@ -1,8 +1,8 @@
|
|
1
|
-
FROM ghcr.io/astral-sh/uv:python3.13-bookworm
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
|
2
2
|
ARG EXTRAS=""
|
3
3
|
WORKDIR /app
|
4
|
-
ENV PYTHONDONTWRITEBYTECODE
|
5
|
-
ENV PYTHONUNBUFFERED
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
5
|
+
ENV PYTHONUNBUFFERED=1
|
6
6
|
ENV UV_LINK_MODE=copy
|
7
7
|
|
8
8
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
@@ -18,4 +18,4 @@ RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --co
|
|
18
18
|
|
19
19
|
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
20
|
USER appuser
|
21
|
-
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
21
|
+
CMD ["/app/.venv/bin/litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -15,6 +15,7 @@ jobs:
|
|
15
15
|
packages: write
|
16
16
|
|
17
17
|
strategy:
|
18
|
+
max-parallel: 2
|
18
19
|
matrix:
|
19
20
|
include:
|
20
21
|
- name: core
|
@@ -34,6 +35,16 @@ jobs:
|
|
34
35
|
tag_suffix: "-all"
|
35
36
|
|
36
37
|
steps:
|
38
|
+
- name: Free up disk space
|
39
|
+
run: |
|
40
|
+
# Remove large unnecessary packages to free up space
|
41
|
+
sudo rm -rf /usr/share/dotnet
|
42
|
+
sudo rm -rf /usr/local/lib/android
|
43
|
+
sudo rm -rf /opt/ghc
|
44
|
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
45
|
+
sudo docker system prune -af
|
46
|
+
df -h
|
47
|
+
|
37
48
|
- name: Checkout repository
|
38
49
|
uses: actions/checkout@v4
|
39
50
|
with:
|
@@ -81,12 +92,14 @@ jobs:
|
|
81
92
|
with:
|
82
93
|
context: .
|
83
94
|
file: ./.docker/Dockerfile
|
84
|
-
platforms: linux/amd64,linux/arm64
|
95
|
+
platforms: ${{ matrix.name == 'all' && 'linux/amd64' || 'linux/amd64,linux/arm64' }}
|
85
96
|
push: true
|
86
97
|
build-args: |
|
87
98
|
EXTRAS=${{ matrix.extras }}
|
88
99
|
tags: ${{ steps.meta.outputs.tags }}
|
89
100
|
labels: ${{ steps.meta.outputs.labels }}
|
101
|
+
cache-from: type=gha
|
102
|
+
cache-to: type=gha,mode=max
|
90
103
|
|
91
104
|
- name: Update Docker Hub README
|
92
105
|
uses: peter-evans/dockerhub-description@v4
|
@@ -6,7 +6,7 @@ repos:
|
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
8
|
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
-
rev: v1.1.
|
9
|
+
rev: v1.1.4
|
10
10
|
hooks:
|
11
11
|
- id: ai-rulez-validate
|
12
12
|
- id: ai-rulez-generate
|
@@ -53,7 +53,7 @@ repos:
|
|
53
53
|
hooks:
|
54
54
|
- id: pyproject-fmt
|
55
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.2
|
57
57
|
hooks:
|
58
58
|
- id: ruff
|
59
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.6.1
|
3
|
+
Version: 3.7.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -25,7 +25,8 @@ Requires-Python: >=3.10
|
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
26
|
Requires-Dist: charset-normalizer>=3.4.2
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.6.0
|
29
|
+
Requires-Dist: mcp>=1.11.0
|
29
30
|
Requires-Dist: msgspec>=0.18.0
|
30
31
|
Requires-Dist: playa-pdf>=0.6.1
|
31
32
|
Requires-Dist: psutil>=7.0.0
|
@@ -83,14 +84,15 @@ Description-Content-Type: text/markdown
|
|
83
84
|
|
84
85
|
## Why Kreuzberg?
|
85
86
|
|
86
|
-
- **🚀 Fastest Performance**: [
|
87
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
87
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
88
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
88
89
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
89
90
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
90
91
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
91
92
|
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
92
93
|
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
93
|
-
-
|
94
|
+
- **🤖 AI Integration**: Native MCP server for Claude and other AI tools
|
95
|
+
- **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
|
94
96
|
|
95
97
|
## Quick Start
|
96
98
|
|
@@ -136,17 +138,66 @@ asyncio.run(main())
|
|
136
138
|
|
137
139
|
## Deployment Options
|
138
140
|
|
141
|
+
### 🤖 MCP Server (AI Integration)
|
142
|
+
|
143
|
+
**Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
|
144
|
+
|
145
|
+
```bash
|
146
|
+
# Install and run MCP server with all features (recommended)
|
147
|
+
pip install "kreuzberg[all]"
|
148
|
+
kreuzberg-mcp
|
149
|
+
|
150
|
+
# Or with uvx (recommended for Claude Desktop)
|
151
|
+
uvx --with "kreuzberg[all]" kreuzberg-mcp
|
152
|
+
|
153
|
+
# Basic installation (core features only)
|
154
|
+
pip install kreuzberg
|
155
|
+
kreuzberg-mcp
|
156
|
+
```
|
157
|
+
|
158
|
+
**Configure in Claude Desktop (`claude_desktop_config.json`):**
|
159
|
+
|
160
|
+
```json
|
161
|
+
{
|
162
|
+
"mcpServers": {
|
163
|
+
"kreuzberg": {
|
164
|
+
"command": "uvx",
|
165
|
+
"args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
|
166
|
+
}
|
167
|
+
}
|
168
|
+
}
|
169
|
+
```
|
170
|
+
|
171
|
+
**Basic configuration (core features only):**
|
172
|
+
|
173
|
+
```json
|
174
|
+
{
|
175
|
+
"mcpServers": {
|
176
|
+
"kreuzberg": {
|
177
|
+
"command": "uvx",
|
178
|
+
"args": ["kreuzberg-mcp"]
|
179
|
+
}
|
180
|
+
}
|
181
|
+
}
|
182
|
+
```
|
183
|
+
|
184
|
+
**Available MCP capabilities:**
|
185
|
+
|
186
|
+
- **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
|
187
|
+
- **Resources**: Configuration, supported formats, OCR backends
|
188
|
+
- **Prompts**: Extract-and-summarize, structured analysis workflows
|
189
|
+
|
139
190
|
### 🐳 Docker (Recommended)
|
140
191
|
|
141
192
|
```bash
|
142
193
|
# Run API server
|
143
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
194
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
144
195
|
|
145
196
|
# Extract files
|
146
197
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
147
198
|
```
|
148
199
|
|
149
|
-
Available variants: `3.
|
200
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
150
201
|
|
151
202
|
### 🌐 REST API
|
152
203
|
|
@@ -191,15 +242,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
191
242
|
|
192
243
|
## Performance
|
193
244
|
|
194
|
-
**
|
245
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
246
|
+
|
247
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
248
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
249
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
250
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
251
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
252
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
195
253
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
200
|
-
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
201
|
-
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
254
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
255
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
256
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
202
257
|
|
258
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
203
259
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
204
260
|
|
205
261
|
## Documentation
|
@@ -216,6 +272,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
216
272
|
|
217
273
|
## Advanced Features
|
218
274
|
|
275
|
+
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
219
276
|
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
220
277
|
- **🧩 Content Chunking**: Split documents for RAG applications
|
221
278
|
- **🎯 Custom Extractors**: Extend with your own document handlers
|
@@ -233,7 +290,7 @@ ______________________________________________________________________
|
|
233
290
|
|
234
291
|
<div align="center">
|
235
292
|
|
236
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
293
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
237
294
|
|
238
295
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
239
296
|
|
@@ -11,14 +11,15 @@
|
|
11
11
|
|
12
12
|
## Why Kreuzberg?
|
13
13
|
|
14
|
-
- **🚀 Fastest Performance**: [
|
15
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
14
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
15
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
16
16
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
17
17
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
18
18
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
19
19
|
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
20
20
|
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
21
|
-
-
|
21
|
+
- **🤖 AI Integration**: Native MCP server for Claude and other AI tools
|
22
|
+
- **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
|
22
23
|
|
23
24
|
## Quick Start
|
24
25
|
|
@@ -64,17 +65,66 @@ asyncio.run(main())
|
|
64
65
|
|
65
66
|
## Deployment Options
|
66
67
|
|
68
|
+
### 🤖 MCP Server (AI Integration)
|
69
|
+
|
70
|
+
**Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
|
71
|
+
|
72
|
+
```bash
|
73
|
+
# Install and run MCP server with all features (recommended)
|
74
|
+
pip install "kreuzberg[all]"
|
75
|
+
kreuzberg-mcp
|
76
|
+
|
77
|
+
# Or with uvx (recommended for Claude Desktop)
|
78
|
+
uvx --with "kreuzberg[all]" kreuzberg-mcp
|
79
|
+
|
80
|
+
# Basic installation (core features only)
|
81
|
+
pip install kreuzberg
|
82
|
+
kreuzberg-mcp
|
83
|
+
```
|
84
|
+
|
85
|
+
**Configure in Claude Desktop (`claude_desktop_config.json`):**
|
86
|
+
|
87
|
+
```json
|
88
|
+
{
|
89
|
+
"mcpServers": {
|
90
|
+
"kreuzberg": {
|
91
|
+
"command": "uvx",
|
92
|
+
"args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
|
93
|
+
}
|
94
|
+
}
|
95
|
+
}
|
96
|
+
```
|
97
|
+
|
98
|
+
**Basic configuration (core features only):**
|
99
|
+
|
100
|
+
```json
|
101
|
+
{
|
102
|
+
"mcpServers": {
|
103
|
+
"kreuzberg": {
|
104
|
+
"command": "uvx",
|
105
|
+
"args": ["kreuzberg-mcp"]
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
```
|
110
|
+
|
111
|
+
**Available MCP capabilities:**
|
112
|
+
|
113
|
+
- **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
|
114
|
+
- **Resources**: Configuration, supported formats, OCR backends
|
115
|
+
- **Prompts**: Extract-and-summarize, structured analysis workflows
|
116
|
+
|
67
117
|
### 🐳 Docker (Recommended)
|
68
118
|
|
69
119
|
```bash
|
70
120
|
# Run API server
|
71
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
121
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
72
122
|
|
73
123
|
# Extract files
|
74
124
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
75
125
|
```
|
76
126
|
|
77
|
-
Available variants: `3.
|
127
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
78
128
|
|
79
129
|
### 🌐 REST API
|
80
130
|
|
@@ -119,15 +169,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
119
169
|
|
120
170
|
## Performance
|
121
171
|
|
122
|
-
**
|
172
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
173
|
+
|
174
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
175
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
176
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
177
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
178
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
179
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
123
180
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
128
|
-
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
129
|
-
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
181
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
182
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
183
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
130
184
|
|
185
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
131
186
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
132
187
|
|
133
188
|
## Documentation
|
@@ -144,6 +199,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
144
199
|
|
145
200
|
## Advanced Features
|
146
201
|
|
202
|
+
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
147
203
|
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
148
204
|
- **🧩 Content Chunking**: Split documents for RAG applications
|
149
205
|
- **🎯 Custom Extractors**: Extend with your own document handlers
|
@@ -161,7 +217,7 @@ ______________________________________________________________________
|
|
161
217
|
|
162
218
|
<div align="center">
|
163
219
|
|
164
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
220
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
165
221
|
|
166
222
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
167
223
|
|
@@ -6,6 +6,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
6
6
|
|
7
7
|
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
8
8
|
- **Local Processing**: No external API calls or cloud dependencies required
|
9
|
+
- **AI Integration**: Native MCP server for Claude Desktop and other AI tools
|
9
10
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
10
11
|
- **Small Package Size**: Has few curated dependencies and a minimal footprint
|
11
12
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
@@ -11,6 +11,7 @@ This guide covers the main concepts and usage patterns of Kreuzberg.
|
|
11
11
|
- [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
|
12
12
|
- [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines
|
13
13
|
- [Supported Formats](supported-formats.md) - All supported document formats
|
14
|
+
- [MCP Server](mcp-server.md) - Model Context Protocol server for AI integration
|
14
15
|
- [API Server](api-server.md) - REST API for document extraction
|
15
16
|
- [Docker](docker.md) - Using Kreuzberg with Docker
|
16
17
|
|