kreuzberg 3.6.1__tar.gz → 3.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.docker/Dockerfile +4 -4
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/workflows/publish-docker.yml +14 -1
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.pre-commit-config.yaml +2 -2
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/PKG-INFO +19 -14
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/README.md +17 -12
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/pyproject.toml +2 -2
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extraction_test.py +2 -2
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/uv.lock +60 -55
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.commitlintrc +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.docker/README.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.dockerignore +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.gitignore +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.gitmodules +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/.markdownlint.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/LICENSE +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/ai-rulez.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/README.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/advanced/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/assets/logo.png +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/changelog.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/cli.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/contributing.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/css/extra.css +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/examples/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_easyocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/mkdocs.yaml +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/api/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/api/main_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/chunker_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/cli_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/conftest.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/gmft_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/hooks_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/playa_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/registry_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/types_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.6.1 → kreuzberg-3.6.2}/tests/utils/tmp_test.py +0 -0
@@ -1,8 +1,8 @@
|
|
1
|
-
FROM ghcr.io/astral-sh/uv:python3.13-bookworm
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
|
2
2
|
ARG EXTRAS=""
|
3
3
|
WORKDIR /app
|
4
|
-
ENV PYTHONDONTWRITEBYTECODE
|
5
|
-
ENV PYTHONUNBUFFERED
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
5
|
+
ENV PYTHONUNBUFFERED=1
|
6
6
|
ENV UV_LINK_MODE=copy
|
7
7
|
|
8
8
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
@@ -18,4 +18,4 @@ RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --co
|
|
18
18
|
|
19
19
|
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
20
|
USER appuser
|
21
|
-
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
21
|
+
CMD ["/app/.venv/bin/litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -15,6 +15,7 @@ jobs:
|
|
15
15
|
packages: write
|
16
16
|
|
17
17
|
strategy:
|
18
|
+
max-parallel: 2
|
18
19
|
matrix:
|
19
20
|
include:
|
20
21
|
- name: core
|
@@ -34,6 +35,16 @@ jobs:
|
|
34
35
|
tag_suffix: "-all"
|
35
36
|
|
36
37
|
steps:
|
38
|
+
- name: Free up disk space
|
39
|
+
run: |
|
40
|
+
# Remove large unnecessary packages to free up space
|
41
|
+
sudo rm -rf /usr/share/dotnet
|
42
|
+
sudo rm -rf /usr/local/lib/android
|
43
|
+
sudo rm -rf /opt/ghc
|
44
|
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
45
|
+
sudo docker system prune -af
|
46
|
+
df -h
|
47
|
+
|
37
48
|
- name: Checkout repository
|
38
49
|
uses: actions/checkout@v4
|
39
50
|
with:
|
@@ -81,12 +92,14 @@ jobs:
|
|
81
92
|
with:
|
82
93
|
context: .
|
83
94
|
file: ./.docker/Dockerfile
|
84
|
-
platforms: linux/amd64,linux/arm64
|
95
|
+
platforms: ${{ matrix.name == 'all' && 'linux/amd64' || 'linux/amd64,linux/arm64' }}
|
85
96
|
push: true
|
86
97
|
build-args: |
|
87
98
|
EXTRAS=${{ matrix.extras }}
|
88
99
|
tags: ${{ steps.meta.outputs.tags }}
|
89
100
|
labels: ${{ steps.meta.outputs.labels }}
|
101
|
+
cache-from: type=gha
|
102
|
+
cache-to: type=gha,mode=max
|
90
103
|
|
91
104
|
- name: Update Docker Hub README
|
92
105
|
uses: peter-evans/dockerhub-description@v4
|
@@ -6,7 +6,7 @@ repos:
|
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
8
|
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
-
rev: v1.1.
|
9
|
+
rev: v1.1.4
|
10
10
|
hooks:
|
11
11
|
- id: ai-rulez-validate
|
12
12
|
- id: ai-rulez-generate
|
@@ -53,7 +53,7 @@ repos:
|
|
53
53
|
hooks:
|
54
54
|
- id: pyproject-fmt
|
55
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.2
|
57
57
|
hooks:
|
58
58
|
- id: ruff
|
59
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.6.1
|
3
|
+
Version: 3.6.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -25,7 +25,7 @@ Requires-Python: >=3.10
|
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
26
|
Requires-Dist: charset-normalizer>=3.4.2
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.6.0
|
29
29
|
Requires-Dist: msgspec>=0.18.0
|
30
30
|
Requires-Dist: playa-pdf>=0.6.1
|
31
31
|
Requires-Dist: psutil>=7.0.0
|
@@ -83,8 +83,8 @@ Description-Content-Type: text/markdown
|
|
83
83
|
|
84
84
|
## Why Kreuzberg?
|
85
85
|
|
86
|
-
- **🚀 Fastest Performance**: [
|
87
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
86
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
87
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
88
88
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
89
89
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
90
90
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
@@ -140,13 +140,13 @@ asyncio.run(main())
|
|
140
140
|
|
141
141
|
```bash
|
142
142
|
# Run API server
|
143
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
143
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
144
144
|
|
145
145
|
# Extract files
|
146
146
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
147
147
|
```
|
148
148
|
|
149
|
-
Available variants: `3.
|
149
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
150
150
|
|
151
151
|
### 🌐 REST API
|
152
152
|
|
@@ -191,15 +191,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
191
191
|
|
192
192
|
## Performance
|
193
193
|
|
194
|
-
**
|
194
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
195
195
|
|
196
|
-
| Library | Speed
|
197
|
-
| ------------- |
|
198
|
-
| **Kreuzberg** |
|
199
|
-
| Unstructured |
|
200
|
-
| MarkItDown |
|
201
|
-
| Docling |
|
196
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
197
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
198
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
199
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
200
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
201
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
202
202
|
|
203
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
204
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
205
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
206
|
+
|
207
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
203
208
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
204
209
|
|
205
210
|
## Documentation
|
@@ -233,7 +238,7 @@ ______________________________________________________________________
|
|
233
238
|
|
234
239
|
<div align="center">
|
235
240
|
|
236
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
241
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
237
242
|
|
238
243
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
239
244
|
|
@@ -11,8 +11,8 @@
|
|
11
11
|
|
12
12
|
## Why Kreuzberg?
|
13
13
|
|
14
|
-
- **🚀 Fastest Performance**: [
|
15
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
14
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
15
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
16
16
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
17
17
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
18
18
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
@@ -68,13 +68,13 @@ asyncio.run(main())
|
|
68
68
|
|
69
69
|
```bash
|
70
70
|
# Run API server
|
71
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
71
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
72
72
|
|
73
73
|
# Extract files
|
74
74
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
75
75
|
```
|
76
76
|
|
77
|
-
Available variants: `3.
|
77
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
78
78
|
|
79
79
|
### 🌐 REST API
|
80
80
|
|
@@ -119,15 +119,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
119
119
|
|
120
120
|
## Performance
|
121
121
|
|
122
|
-
**
|
122
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
123
123
|
|
124
|
-
| Library | Speed
|
125
|
-
| ------------- |
|
126
|
-
| **Kreuzberg** |
|
127
|
-
| Unstructured |
|
128
|
-
| MarkItDown |
|
129
|
-
| Docling |
|
124
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
125
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
126
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
127
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
128
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
129
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
130
130
|
|
131
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
132
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
133
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
134
|
+
|
135
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
131
136
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
132
137
|
|
133
138
|
## Documentation
|
@@ -161,7 +166,7 @@ ______________________________________________________________________
|
|
161
166
|
|
162
167
|
<div align="center">
|
163
168
|
|
164
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
169
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
165
170
|
|
166
171
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
167
172
|
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.6.1"
|
8
|
+
version = "3.6.2"
|
9
9
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -49,7 +49,7 @@ dependencies = [
|
|
49
49
|
"anyio>=4.9.0",
|
50
50
|
"charset-normalizer>=3.4.2",
|
51
51
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
52
|
-
"html-to-markdown>=1.
|
52
|
+
"html-to-markdown[lxml]>=1.6.0",
|
53
53
|
"msgspec>=0.18.0",
|
54
54
|
"playa-pdf>=0.6.1", # pinned due to breaking changes in 0.5.0
|
55
55
|
"psutil>=7.0.0",
|
@@ -100,7 +100,7 @@ async def test_extract_bytes_html(html_document: Path) -> None:
|
|
100
100
|
assert_extraction_result(result, mime_type=MARKDOWN_MIME_TYPE)
|
101
101
|
assert (
|
102
102
|
result.content
|
103
|
-
==
|
103
|
+
== 'Browsers usually insert quotation marks around the q element. WWF\'s goal is to: "Build a future where people live in harmony with nature."'
|
104
104
|
)
|
105
105
|
|
106
106
|
|
@@ -169,7 +169,7 @@ async def test_extract_file_html(html_document: Path) -> None:
|
|
169
169
|
assert_extraction_result(result, mime_type=MARKDOWN_MIME_TYPE)
|
170
170
|
assert (
|
171
171
|
result.content
|
172
|
-
==
|
172
|
+
== 'Browsers usually insert quotation marks around the q element. WWF\'s goal is to: "Build a future where people live in harmony with nature."'
|
173
173
|
)
|
174
174
|
|
175
175
|
|
@@ -24,7 +24,7 @@ wheels = [
|
|
24
24
|
|
25
25
|
[[package]]
|
26
26
|
name = "aiohttp"
|
27
|
-
version = "3.12.
|
27
|
+
version = "3.12.14"
|
28
28
|
source = { registry = "https://pypi.org/simple" }
|
29
29
|
dependencies = [
|
30
30
|
{ name = "aiohappyeyeballs" },
|
@@ -35,25 +35,25 @@ dependencies = [
|
|
35
35
|
{ name = "propcache" },
|
36
36
|
{ name = "yarl" },
|
37
37
|
]
|
38
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
38
|
+
sdist = { url = "https://files.pythonhosted.org/packages/e6/0b/e39ad954107ebf213a2325038a3e7a506be3d98e1435e1f82086eec4cde2/aiohttp-3.12.14.tar.gz", hash = "sha256:6e06e120e34d93100de448fd941522e11dafa78ef1a893c179901b7d66aa29f2", size = 7822921, upload-time = "2025-07-10T13:05:33.968Z" }
|
39
39
|
wheels = [
|
40
|
-
{ url = "https://files.pythonhosted.org/packages/
|
41
|
-
{ url = "https://files.pythonhosted.org/packages/
|
42
|
-
{ url = "https://files.pythonhosted.org/packages/
|
43
|
-
{ url = "https://files.pythonhosted.org/packages/
|
44
|
-
{ url = "https://files.pythonhosted.org/packages/
|
45
|
-
{ url = "https://files.pythonhosted.org/packages/
|
46
|
-
{ url = "https://files.pythonhosted.org/packages/
|
47
|
-
{ url = "https://files.pythonhosted.org/packages/
|
48
|
-
{ url = "https://files.pythonhosted.org/packages/
|
49
|
-
{ url = "https://files.pythonhosted.org/packages/
|
50
|
-
{ url = "https://files.pythonhosted.org/packages/
|
51
|
-
{ url = "https://files.pythonhosted.org/packages/
|
52
|
-
{ url = "https://files.pythonhosted.org/packages/
|
53
|
-
{ url = "https://files.pythonhosted.org/packages/
|
54
|
-
{ url = "https://files.pythonhosted.org/packages/
|
55
|
-
{ url = "https://files.pythonhosted.org/packages/
|
56
|
-
{ url = "https://files.pythonhosted.org/packages/
|
40
|
+
{ url = "https://files.pythonhosted.org/packages/06/48/e0d2fa8ac778008071e7b79b93ab31ef14ab88804d7ba71b5c964a7c844e/aiohttp-3.12.14-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3143a7893d94dc82bc409f7308bc10d60285a3cd831a68faf1aa0836c5c3c767", size = 695471, upload-time = "2025-07-10T13:04:20.124Z" },
|
41
|
+
{ url = "https://files.pythonhosted.org/packages/8d/e7/f73206afa33100804f790b71092888f47df65fd9a4cd0e6800d7c6826441/aiohttp-3.12.14-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3d62ac3d506cef54b355bd34c2a7c230eb693880001dfcda0bf88b38f5d7af7e", size = 473128, upload-time = "2025-07-10T13:04:21.928Z" },
|
42
|
+
{ url = "https://files.pythonhosted.org/packages/df/e2/4dd00180be551a6e7ee979c20fc7c32727f4889ee3fd5b0586e0d47f30e1/aiohttp-3.12.14-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:48e43e075c6a438937c4de48ec30fa8ad8e6dfef122a038847456bfe7b947b63", size = 465426, upload-time = "2025-07-10T13:04:24.071Z" },
|
43
|
+
{ url = "https://files.pythonhosted.org/packages/de/dd/525ed198a0bb674a323e93e4d928443a680860802c44fa7922d39436b48b/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077b4488411a9724cecc436cbc8c133e0d61e694995b8de51aaf351c7578949d", size = 1704252, upload-time = "2025-07-10T13:04:26.049Z" },
|
44
|
+
{ url = "https://files.pythonhosted.org/packages/d8/b1/01e542aed560a968f692ab4fc4323286e8bc4daae83348cd63588e4f33e3/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d8c35632575653f297dcbc9546305b2c1133391089ab925a6a3706dfa775ccab", size = 1685514, upload-time = "2025-07-10T13:04:28.186Z" },
|
45
|
+
{ url = "https://files.pythonhosted.org/packages/b3/06/93669694dc5fdabdc01338791e70452d60ce21ea0946a878715688d5a191/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b8ce87963f0035c6834b28f061df90cf525ff7c9b6283a8ac23acee6502afd4", size = 1737586, upload-time = "2025-07-10T13:04:30.195Z" },
|
46
|
+
{ url = "https://files.pythonhosted.org/packages/a5/3a/18991048ffc1407ca51efb49ba8bcc1645961f97f563a6c480cdf0286310/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a2cf66e32a2563bb0766eb24eae7e9a269ac0dc48db0aae90b575dc9583026", size = 1786958, upload-time = "2025-07-10T13:04:32.482Z" },
|
47
|
+
{ url = "https://files.pythonhosted.org/packages/30/a8/81e237f89a32029f9b4a805af6dffc378f8459c7b9942712c809ff9e76e5/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdea089caf6d5cde975084a884c72d901e36ef9c2fd972c9f51efbbc64e96fbd", size = 1709287, upload-time = "2025-07-10T13:04:34.493Z" },
|
48
|
+
{ url = "https://files.pythonhosted.org/packages/8c/e3/bd67a11b0fe7fc12c6030473afd9e44223d456f500f7cf526dbaa259ae46/aiohttp-3.12.14-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7865f27db67d49e81d463da64a59365ebd6b826e0e4847aa111056dcb9dc88", size = 1622990, upload-time = "2025-07-10T13:04:36.433Z" },
|
49
|
+
{ url = "https://files.pythonhosted.org/packages/83/ba/e0cc8e0f0d9ce0904e3cf2d6fa41904e379e718a013c721b781d53dcbcca/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0ab5b38a6a39781d77713ad930cb5e7feea6f253de656a5f9f281a8f5931b086", size = 1676015, upload-time = "2025-07-10T13:04:38.958Z" },
|
50
|
+
{ url = "https://files.pythonhosted.org/packages/d8/b3/1e6c960520bda094c48b56de29a3d978254637ace7168dd97ddc273d0d6c/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b3b15acee5c17e8848d90a4ebc27853f37077ba6aec4d8cb4dbbea56d156933", size = 1707678, upload-time = "2025-07-10T13:04:41.275Z" },
|
51
|
+
{ url = "https://files.pythonhosted.org/packages/0a/19/929a3eb8c35b7f9f076a462eaa9830b32c7f27d3395397665caa5e975614/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4c972b0bdaac167c1e53e16a16101b17c6d0ed7eac178e653a07b9f7fad7151", size = 1650274, upload-time = "2025-07-10T13:04:43.483Z" },
|
52
|
+
{ url = "https://files.pythonhosted.org/packages/22/e5/81682a6f20dd1b18ce3d747de8eba11cbef9b270f567426ff7880b096b48/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7442488b0039257a3bdbc55f7209587911f143fca11df9869578db6c26feeeb8", size = 1726408, upload-time = "2025-07-10T13:04:45.577Z" },
|
53
|
+
{ url = "https://files.pythonhosted.org/packages/8c/17/884938dffaa4048302985483f77dfce5ac18339aad9b04ad4aaa5e32b028/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f68d3067eecb64c5e9bab4a26aa11bd676f4c70eea9ef6536b0a4e490639add3", size = 1759879, upload-time = "2025-07-10T13:04:47.663Z" },
|
54
|
+
{ url = "https://files.pythonhosted.org/packages/95/78/53b081980f50b5cf874359bde707a6eacd6c4be3f5f5c93937e48c9d0025/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f88d3704c8b3d598a08ad17d06006cb1ca52a1182291f04979e305c8be6c9758", size = 1708770, upload-time = "2025-07-10T13:04:49.944Z" },
|
55
|
+
{ url = "https://files.pythonhosted.org/packages/ed/91/228eeddb008ecbe3ffa6c77b440597fdf640307162f0c6488e72c5a2d112/aiohttp-3.12.14-cp313-cp313-win32.whl", hash = "sha256:a3c99ab19c7bf375c4ae3debd91ca5d394b98b6089a03231d4c580ef3c2ae4c5", size = 421688, upload-time = "2025-07-10T13:04:51.993Z" },
|
56
|
+
{ url = "https://files.pythonhosted.org/packages/66/5f/8427618903343402fdafe2850738f735fd1d9409d2a8f9bcaae5e630d3ba/aiohttp-3.12.14-cp313-cp313-win_amd64.whl", hash = "sha256:3f8aad695e12edc9d571f878c62bedc91adf30c760c8632f09663e5f564f4baa", size = 448098, upload-time = "2025-07-10T13:04:53.999Z" },
|
57
57
|
]
|
58
58
|
|
59
59
|
[[package]]
|
@@ -92,11 +92,11 @@ wheels = [
|
|
92
92
|
|
93
93
|
[[package]]
|
94
94
|
name = "asgiref"
|
95
|
-
version = "3.9.
|
95
|
+
version = "3.9.1"
|
96
96
|
source = { registry = "https://pypi.org/simple" }
|
97
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
97
|
+
sdist = { url = "https://files.pythonhosted.org/packages/90/61/0aa957eec22ff70b830b22ff91f825e70e1ef732c06666a805730f28b36b/asgiref-3.9.1.tar.gz", hash = "sha256:a5ab6582236218e5ef1648f242fd9f10626cfd4de8dc377db215d5d5098e3142", size = 36870, upload-time = "2025-07-08T09:07:43.344Z" }
|
98
98
|
wheels = [
|
99
|
-
{ url = "https://files.pythonhosted.org/packages/
|
99
|
+
{ url = "https://files.pythonhosted.org/packages/7c/3c/0464dcada90d5da0e71018c04a140ad6349558afb30b3051b4264cc5b965/asgiref-3.9.1-py3-none-any.whl", hash = "sha256:f3bba7092a48005b5f5bacd747d36ee4a5a61f4a269a6df590b43144355ebd2c", size = 23790, upload-time = "2025-07-08T09:07:41.548Z" },
|
100
100
|
]
|
101
101
|
|
102
102
|
[[package]]
|
@@ -212,11 +212,11 @@ wheels = [
|
|
212
212
|
|
213
213
|
[[package]]
|
214
214
|
name = "certifi"
|
215
|
-
version = "2025.
|
215
|
+
version = "2025.7.9"
|
216
216
|
source = { registry = "https://pypi.org/simple" }
|
217
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
217
|
+
sdist = { url = "https://files.pythonhosted.org/packages/de/8a/c729b6b60c66a38f590c4e774decc4b2ec7b0576be8f1aa984a53ffa812a/certifi-2025.7.9.tar.gz", hash = "sha256:c1d2ec05395148ee10cf672ffc28cd37ea0ab0d99f9cc74c43e588cbd111b079", size = 160386, upload-time = "2025-07-09T02:13:58.874Z" }
|
218
218
|
wheels = [
|
219
|
-
{ url = "https://files.pythonhosted.org/packages/
|
219
|
+
{ url = "https://files.pythonhosted.org/packages/66/f3/80a3f974c8b535d394ff960a11ac20368e06b736da395b551a49ce950cce/certifi-2025.7.9-py3-none-any.whl", hash = "sha256:d842783a14f8fdd646895ac26f719a061408834473cfc10203f6a575beb15d39", size = 159230, upload-time = "2025-07-09T02:13:57.007Z" },
|
220
220
|
]
|
221
221
|
|
222
222
|
[[package]]
|
@@ -866,14 +866,19 @@ wheels = [
|
|
866
866
|
|
867
867
|
[[package]]
|
868
868
|
name = "html-to-markdown"
|
869
|
-
version = "1.
|
869
|
+
version = "1.6.0"
|
870
870
|
source = { registry = "https://pypi.org/simple" }
|
871
871
|
dependencies = [
|
872
872
|
{ name = "beautifulsoup4" },
|
873
873
|
]
|
874
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
874
|
+
sdist = { url = "https://files.pythonhosted.org/packages/6b/33/041a35156641991d6805af0847e82467c69a6071738e982379ff9a55151b/html_to_markdown-1.6.0.tar.gz", hash = "sha256:b2916f9d78f4faac866935688ff3f05f7e0a873554f9d797b8bca0e32afdc921", size = 35099, upload-time = "2025-07-11T06:14:12.227Z" }
|
875
875
|
wheels = [
|
876
|
-
{ url = "https://files.pythonhosted.org/packages/
|
876
|
+
{ url = "https://files.pythonhosted.org/packages/c9/23/38e07928c22dd1facd00781758bc4efb5f94c0240aa59e22b9a290fe44a1/html_to_markdown-1.6.0-py3-none-any.whl", hash = "sha256:88848e851ea80719397fc5356018c367d415bbd1ace030f3ad380fafd49fa0ed", size = 31616, upload-time = "2025-07-11T06:14:10.544Z" },
|
877
|
+
]
|
878
|
+
|
879
|
+
[package.optional-dependencies]
|
880
|
+
lxml = [
|
881
|
+
{ name = "lxml" },
|
877
882
|
]
|
878
883
|
|
879
884
|
[[package]]
|
@@ -1166,12 +1171,12 @@ wheels = [
|
|
1166
1171
|
|
1167
1172
|
[[package]]
|
1168
1173
|
name = "kreuzberg"
|
1169
|
-
version = "3.6.
|
1174
|
+
version = "3.6.2"
|
1170
1175
|
source = { editable = "." }
|
1171
1176
|
dependencies = [
|
1172
1177
|
{ name = "anyio" },
|
1173
1178
|
{ name = "charset-normalizer" },
|
1174
|
-
{ name = "html-to-markdown" },
|
1179
|
+
{ name = "html-to-markdown", extra = ["lxml"] },
|
1175
1180
|
{ name = "msgspec" },
|
1176
1181
|
{ name = "playa-pdf" },
|
1177
1182
|
{ name = "psutil" },
|
@@ -1254,7 +1259,7 @@ requires-dist = [
|
|
1254
1259
|
{ name = "exceptiongroup", marker = "python_full_version < '3.11'", specifier = ">=1.2.2" },
|
1255
1260
|
{ name = "fast-langdetect", marker = "extra == 'langdetect'", specifier = ">=0.3.2" },
|
1256
1261
|
{ name = "gmft", marker = "extra == 'gmft'", specifier = ">=0.4.2" },
|
1257
|
-
{ name = "html-to-markdown", specifier = ">=1.
|
1262
|
+
{ name = "html-to-markdown", extras = ["lxml"], specifier = ">=1.6.0" },
|
1258
1263
|
{ name = "keybert", marker = "extra == 'entity-extraction'", specifier = ">=0.9.0" },
|
1259
1264
|
{ name = "kreuzberg", extras = ["api", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
|
1260
1265
|
{ name = "litestar", extras = ["standard", "structlog", "opentelemetry"], marker = "extra == 'api'", specifier = ">=2.16.0" },
|
@@ -1422,7 +1427,7 @@ wheels = [
|
|
1422
1427
|
|
1423
1428
|
[[package]]
|
1424
1429
|
name = "langsmith"
|
1425
|
-
version = "0.4.
|
1430
|
+
version = "0.4.5"
|
1426
1431
|
source = { registry = "https://pypi.org/simple" }
|
1427
1432
|
dependencies = [
|
1428
1433
|
{ name = "httpx" },
|
@@ -1433,9 +1438,9 @@ dependencies = [
|
|
1433
1438
|
{ name = "requests-toolbelt" },
|
1434
1439
|
{ name = "zstandard" },
|
1435
1440
|
]
|
1436
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
1441
|
+
sdist = { url = "https://files.pythonhosted.org/packages/5c/92/7885823f3d13222f57773921f0da19b37d628c64607491233dc853a0f6ea/langsmith-0.4.5.tar.gz", hash = "sha256:49444bd8ccd4e46402f1b9ff1d686fa8e3a31b175e7085e72175ab8ec6164a34", size = 352235, upload-time = "2025-07-10T22:08:04.505Z" }
|
1437
1442
|
wheels = [
|
1438
|
-
{ url = "https://files.pythonhosted.org/packages/
|
1443
|
+
{ url = "https://files.pythonhosted.org/packages/c8/10/ad3107b666c3203b7938d10ea6b8746b9735c399cf737a51386d58e41d34/langsmith-0.4.5-py3-none-any.whl", hash = "sha256:4167717a2cccc4dff5809dbddc439628e836f6fd13d4fdb31ea013bc8d5cfaf5", size = 367795, upload-time = "2025-07-10T22:08:02.548Z" },
|
1439
1444
|
]
|
1440
1445
|
|
1441
1446
|
[[package]]
|
@@ -2170,7 +2175,7 @@ wheels = [
|
|
2170
2175
|
|
2171
2176
|
[[package]]
|
2172
2177
|
name = "openai"
|
2173
|
-
version = "1.
|
2178
|
+
version = "1.95.0"
|
2174
2179
|
source = { registry = "https://pypi.org/simple" }
|
2175
2180
|
dependencies = [
|
2176
2181
|
{ name = "anyio" },
|
@@ -2182,9 +2187,9 @@ dependencies = [
|
|
2182
2187
|
{ name = "tqdm" },
|
2183
2188
|
{ name = "typing-extensions" },
|
2184
2189
|
]
|
2185
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
2190
|
+
sdist = { url = "https://files.pythonhosted.org/packages/ef/2f/0c6f509a1585545962bfa6e201d7fb658eb2a6f52fb8c26765632d91706c/openai-1.95.0.tar.gz", hash = "sha256:54bc42df9f7142312647dd485d34cca5df20af825fa64a30ca55164be2cf4cc9", size = 488144, upload-time = "2025-07-10T18:35:49.946Z" }
|
2186
2191
|
wheels = [
|
2187
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2192
|
+
{ url = "https://files.pythonhosted.org/packages/19/a5/57d0bb58b938a3e3f352ff26e645da1660436402a6ad1b29780d261cc5a5/openai-1.95.0-py3-none-any.whl", hash = "sha256:a7afc9dca7e7d616371842af8ea6dbfbcb739a85d183f5f664ab1cc311b9ef18", size = 755572, upload-time = "2025-07-10T18:35:47.507Z" },
|
2188
2193
|
]
|
2189
2194
|
|
2190
2195
|
[[package]]
|
@@ -2392,7 +2397,7 @@ wheels = [
|
|
2392
2397
|
|
2393
2398
|
[[package]]
|
2394
2399
|
name = "paddlex"
|
2395
|
-
version = "3.1.
|
2400
|
+
version = "3.1.2"
|
2396
2401
|
source = { registry = "https://pypi.org/simple" }
|
2397
2402
|
dependencies = [
|
2398
2403
|
{ name = "chardet" },
|
@@ -2413,7 +2418,7 @@ dependencies = [
|
|
2413
2418
|
{ name = "ujson" },
|
2414
2419
|
]
|
2415
2420
|
wheels = [
|
2416
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2421
|
+
{ url = "https://files.pythonhosted.org/packages/5a/71/f824642aa730a45c6cd195c2df3d0d6376f170023f66aadc1b9f1f856ac7/paddlex-3.1.2-py3-none-any.whl", hash = "sha256:a87ede09cf6a9aebc09deadfaaedc6377505b408cac2ca4ec372c90e7399c71b", size = 1688520, upload-time = "2025-07-08T08:47:20.639Z" },
|
2417
2422
|
]
|
2418
2423
|
|
2419
2424
|
[package.optional-dependencies]
|
@@ -2489,7 +2494,7 @@ wheels = [
|
|
2489
2494
|
|
2490
2495
|
[[package]]
|
2491
2496
|
name = "pandas"
|
2492
|
-
version = "2.3.
|
2497
|
+
version = "2.3.1"
|
2493
2498
|
source = { registry = "https://pypi.org/simple" }
|
2494
2499
|
dependencies = [
|
2495
2500
|
{ name = "numpy" },
|
@@ -2497,21 +2502,21 @@ dependencies = [
|
|
2497
2502
|
{ name = "pytz" },
|
2498
2503
|
{ name = "tzdata" },
|
2499
2504
|
]
|
2500
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
2501
|
-
wheels = [
|
2502
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2503
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2504
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2505
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2506
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2507
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2508
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2509
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2510
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2511
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2512
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2513
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2514
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2505
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = "sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" }
|
2506
|
+
wheels = [
|
2507
|
+
{ url = "https://files.pythonhosted.org/packages/32/ed/ff0a67a2c5505e1854e6715586ac6693dd860fbf52ef9f81edee200266e7/pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22", size = 11531393, upload-time = "2025-07-07T19:19:12.245Z" },
|
2508
|
+
{ url = "https://files.pythonhosted.org/packages/c7/db/d8f24a7cc9fb0972adab0cc80b6817e8bef888cfd0024eeb5a21c0bb5c4a/pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a", size = 10668750, upload-time = "2025-07-07T19:19:14.612Z" },
|
2509
|
+
{ url = "https://files.pythonhosted.org/packages/0f/b0/80f6ec783313f1e2356b28b4fd8d2148c378370045da918c73145e6aab50/pandas-2.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928", size = 11342004, upload-time = "2025-07-07T19:19:16.857Z" },
|
2510
|
+
{ url = "https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9", size = 12050869, upload-time = "2025-07-07T19:19:19.265Z" },
|
2511
|
+
{ url = "https://files.pythonhosted.org/packages/55/79/20d746b0a96c67203a5bee5fb4e00ac49c3e8009a39e1f78de264ecc5729/pandas-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12", size = 12750218, upload-time = "2025-07-07T19:19:21.547Z" },
|
2512
|
+
{ url = "https://files.pythonhosted.org/packages/7c/0f/145c8b41e48dbf03dd18fdd7f24f8ba95b8254a97a3379048378f33e7838/pandas-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb", size = 13416763, upload-time = "2025-07-07T19:19:23.939Z" },
|
2513
|
+
{ url = "https://files.pythonhosted.org/packages/b2/c0/54415af59db5cdd86a3d3bf79863e8cc3fa9ed265f0745254061ac09d5f2/pandas-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956", size = 10987482, upload-time = "2025-07-07T19:19:42.699Z" },
|
2514
|
+
{ url = "https://files.pythonhosted.org/packages/48/64/2fd2e400073a1230e13b8cd604c9bc95d9e3b962e5d44088ead2e8f0cfec/pandas-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a", size = 12029159, upload-time = "2025-07-07T19:19:26.362Z" },
|
2515
|
+
{ url = "https://files.pythonhosted.org/packages/d8/0a/d84fd79b0293b7ef88c760d7dca69828d867c89b6d9bc52d6a27e4d87316/pandas-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9", size = 11393287, upload-time = "2025-07-07T19:19:29.157Z" },
|
2516
|
+
{ url = "https://files.pythonhosted.org/packages/50/ae/ff885d2b6e88f3c7520bb74ba319268b42f05d7e583b5dded9837da2723f/pandas-2.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275", size = 11309381, upload-time = "2025-07-07T19:19:31.436Z" },
|
2517
|
+
{ url = "https://files.pythonhosted.org/packages/85/86/1fa345fc17caf5d7780d2699985c03dbe186c68fee00b526813939062bb0/pandas-2.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab", size = 11883998, upload-time = "2025-07-07T19:19:34.267Z" },
|
2518
|
+
{ url = "https://files.pythonhosted.org/packages/81/aa/e58541a49b5e6310d89474333e994ee57fea97c8aaa8fc7f00b873059bbf/pandas-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96", size = 12704705, upload-time = "2025-07-07T19:19:36.856Z" },
|
2519
|
+
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
2515
2520
|
]
|
2516
2521
|
|
2517
2522
|
[[package]]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{kreuzberg-3.6.1 → kreuzberg-3.6.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|