kreuzberg 3.6.0__tar.gz → 3.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.docker/Dockerfile +4 -4
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/workflows/publish-docker.yml +26 -5
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/workflows/release.yaml +5 -11
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.pre-commit-config.yaml +2 -2
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/PKG-INFO +19 -14
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/README.md +17 -12
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/pyproject.toml +2 -2
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extraction_test.py +2 -2
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/uv.lock +60 -55
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.commitlintrc +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.docker/README.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.dockerignore +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.gitignore +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.gitmodules +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/.markdownlint.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/LICENSE +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/ai-rulez.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/README.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/advanced/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/assets/logo.png +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/changelog.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/cli.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/contributing.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/css/extra.css +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/examples/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_easyocr.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/mkdocs.yaml +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/api/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/api/main_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/chunker_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/cli_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/conftest.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/gmft_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/hooks_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/playa_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/registry_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/types_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.6.0 → kreuzberg-3.6.2}/tests/utils/tmp_test.py +0 -0
@@ -1,8 +1,8 @@
|
|
1
|
-
FROM ghcr.io/astral-sh/uv:python3.13-bookworm
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
|
2
2
|
ARG EXTRAS=""
|
3
3
|
WORKDIR /app
|
4
|
-
ENV PYTHONDONTWRITEBYTECODE
|
5
|
-
ENV PYTHONUNBUFFERED
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
5
|
+
ENV PYTHONUNBUFFERED=1
|
6
6
|
ENV UV_LINK_MODE=copy
|
7
7
|
|
8
8
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
@@ -18,4 +18,4 @@ RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --co
|
|
18
18
|
|
19
19
|
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
20
|
USER appuser
|
21
|
-
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
21
|
+
CMD ["/app/.venv/bin/litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -3,16 +3,19 @@ name: Publish Docker Images
|
|
3
3
|
|
4
4
|
on:
|
5
5
|
workflow_dispatch:
|
6
|
+
release:
|
7
|
+
types: [published]
|
6
8
|
|
7
9
|
jobs:
|
8
10
|
build-and-push:
|
9
11
|
runs-on: ubuntu-latest
|
10
|
-
if: ${{ github.event_name == 'workflow_dispatch' }}
|
12
|
+
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
|
11
13
|
permissions:
|
12
14
|
contents: read
|
13
15
|
packages: write
|
14
16
|
|
15
17
|
strategy:
|
18
|
+
max-parallel: 2
|
16
19
|
matrix:
|
17
20
|
include:
|
18
21
|
- name: core
|
@@ -32,6 +35,16 @@ jobs:
|
|
32
35
|
tag_suffix: "-all"
|
33
36
|
|
34
37
|
steps:
|
38
|
+
- name: Free up disk space
|
39
|
+
run: |
|
40
|
+
# Remove large unnecessary packages to free up space
|
41
|
+
sudo rm -rf /usr/share/dotnet
|
42
|
+
sudo rm -rf /usr/local/lib/android
|
43
|
+
sudo rm -rf /opt/ghc
|
44
|
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
45
|
+
sudo docker system prune -af
|
46
|
+
df -h
|
47
|
+
|
35
48
|
- name: Checkout repository
|
36
49
|
uses: actions/checkout@v4
|
37
50
|
with:
|
@@ -40,10 +53,16 @@ jobs:
|
|
40
53
|
- name: Get release version
|
41
54
|
id: get_version
|
42
55
|
run: |
|
43
|
-
|
44
|
-
|
45
|
-
|
56
|
+
if [ "${{ github.event_name }}" = "release" ]; then
|
57
|
+
# For release events, use the release tag
|
58
|
+
VERSION="${{ github.event.release.tag_name }}"
|
59
|
+
else
|
60
|
+
# For workflow_dispatch, get the latest tag
|
61
|
+
git fetch --tags
|
62
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
63
|
+
fi
|
46
64
|
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
65
|
+
echo "Using version: $VERSION"
|
47
66
|
|
48
67
|
- name: Set up QEMU
|
49
68
|
uses: docker/setup-qemu-action@v3
|
@@ -73,12 +92,14 @@ jobs:
|
|
73
92
|
with:
|
74
93
|
context: .
|
75
94
|
file: ./.docker/Dockerfile
|
76
|
-
platforms: linux/amd64,linux/arm64
|
95
|
+
platforms: ${{ matrix.name == 'all' && 'linux/amd64' || 'linux/amd64,linux/arm64' }}
|
77
96
|
push: true
|
78
97
|
build-args: |
|
79
98
|
EXTRAS=${{ matrix.extras }}
|
80
99
|
tags: ${{ steps.meta.outputs.tags }}
|
81
100
|
labels: ${{ steps.meta.outputs.labels }}
|
101
|
+
cache-from: type=gha
|
102
|
+
cache-to: type=gha,mode=max
|
82
103
|
|
83
104
|
- name: Update Docker Hub README
|
84
105
|
uses: peter-evans/dockerhub-description@v4
|
@@ -10,6 +10,7 @@ jobs:
|
|
10
10
|
environment: pypi
|
11
11
|
permissions:
|
12
12
|
id-token: write
|
13
|
+
contents: read
|
13
14
|
steps:
|
14
15
|
- name: Checkout
|
15
16
|
uses: actions/checkout@v4
|
@@ -30,14 +31,7 @@ jobs:
|
|
30
31
|
- name: Publish
|
31
32
|
uses: pypa/gh-action-pypi-publish@release/v1
|
32
33
|
|
33
|
-
- name:
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
script: |
|
38
|
-
await github.rest.actions.createWorkflowDispatch({
|
39
|
-
owner: context.repo.owner,
|
40
|
-
repo: context.repo.repo,
|
41
|
-
workflow_id: 'publish-docker.yml',
|
42
|
-
ref: 'main'
|
43
|
-
});
|
34
|
+
- name: Docker Build Info
|
35
|
+
run: |
|
36
|
+
echo "Docker images will be built automatically by the publish-docker.yml workflow"
|
37
|
+
echo "triggered by this release event. No manual triggering needed."
|
@@ -6,7 +6,7 @@ repos:
|
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
8
|
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
-
rev: v1.1.
|
9
|
+
rev: v1.1.4
|
10
10
|
hooks:
|
11
11
|
- id: ai-rulez-validate
|
12
12
|
- id: ai-rulez-generate
|
@@ -53,7 +53,7 @@ repos:
|
|
53
53
|
hooks:
|
54
54
|
- id: pyproject-fmt
|
55
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.2
|
57
57
|
hooks:
|
58
58
|
- id: ruff
|
59
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.6.0
|
3
|
+
Version: 3.6.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -25,7 +25,7 @@ Requires-Python: >=3.10
|
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
26
|
Requires-Dist: charset-normalizer>=3.4.2
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.6.0
|
29
29
|
Requires-Dist: msgspec>=0.18.0
|
30
30
|
Requires-Dist: playa-pdf>=0.6.1
|
31
31
|
Requires-Dist: psutil>=7.0.0
|
@@ -83,8 +83,8 @@ Description-Content-Type: text/markdown
|
|
83
83
|
|
84
84
|
## Why Kreuzberg?
|
85
85
|
|
86
|
-
- **🚀 Fastest Performance**: [
|
87
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
86
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
87
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
88
88
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
89
89
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
90
90
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
@@ -140,13 +140,13 @@ asyncio.run(main())
|
|
140
140
|
|
141
141
|
```bash
|
142
142
|
# Run API server
|
143
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
143
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
144
144
|
|
145
145
|
# Extract files
|
146
146
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
147
147
|
```
|
148
148
|
|
149
|
-
Available variants: `3.
|
149
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
150
150
|
|
151
151
|
### 🌐 REST API
|
152
152
|
|
@@ -191,15 +191,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
191
191
|
|
192
192
|
## Performance
|
193
193
|
|
194
|
-
**
|
194
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
195
195
|
|
196
|
-
| Library | Speed
|
197
|
-
| ------------- |
|
198
|
-
| **Kreuzberg** |
|
199
|
-
| Unstructured |
|
200
|
-
| MarkItDown |
|
201
|
-
| Docling |
|
196
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
197
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
198
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
199
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
200
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
201
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
202
202
|
|
203
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
204
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
205
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
206
|
+
|
207
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
203
208
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
204
209
|
|
205
210
|
## Documentation
|
@@ -233,7 +238,7 @@ ______________________________________________________________________
|
|
233
238
|
|
234
239
|
<div align="center">
|
235
240
|
|
236
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
241
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
237
242
|
|
238
243
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
239
244
|
|
@@ -11,8 +11,8 @@
|
|
11
11
|
|
12
12
|
## Why Kreuzberg?
|
13
13
|
|
14
|
-
- **🚀 Fastest Performance**: [
|
15
|
-
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
14
|
+
- **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
|
15
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
|
16
16
|
- **⚡ Dual APIs**: Only library with both sync and async support
|
17
17
|
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
18
18
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
@@ -68,13 +68,13 @@ asyncio.run(main())
|
|
68
68
|
|
69
69
|
```bash
|
70
70
|
# Run API server
|
71
|
-
docker run -p 8000:8000 goldziher/kreuzberg:
|
71
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
72
72
|
|
73
73
|
# Extract files
|
74
74
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
75
75
|
```
|
76
76
|
|
77
|
-
Available variants: `3.
|
77
|
+
Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
|
78
78
|
|
79
79
|
### 🌐 REST API
|
80
80
|
|
@@ -119,15 +119,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
119
119
|
|
120
120
|
## Performance
|
121
121
|
|
122
|
-
**
|
122
|
+
**[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
123
123
|
|
124
|
-
| Library | Speed
|
125
|
-
| ------------- |
|
126
|
-
| **Kreuzberg** |
|
127
|
-
| Unstructured |
|
128
|
-
| MarkItDown |
|
129
|
-
| Docling |
|
124
|
+
| Library | Speed | Memory | Install Size | Dependencies | Success Rate |
|
125
|
+
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
126
|
+
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
127
|
+
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
128
|
+
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
129
|
+
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
130
130
|
|
131
|
+
\*_Can achieve 75% reliability with 15% performance trade-off when configured_
|
132
|
+
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
133
|
+
‡_Frequently fails/times out on medium files (>1MB)_
|
134
|
+
|
135
|
+
> **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
|
131
136
|
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
132
137
|
|
133
138
|
## Documentation
|
@@ -161,7 +166,7 @@ ______________________________________________________________________
|
|
161
166
|
|
162
167
|
<div align="center">
|
163
168
|
|
164
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
169
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
165
170
|
|
166
171
|
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
167
172
|
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.6.0"
|
8
|
+
version = "3.6.2"
|
9
9
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -49,7 +49,7 @@ dependencies = [
|
|
49
49
|
"anyio>=4.9.0",
|
50
50
|
"charset-normalizer>=3.4.2",
|
51
51
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
52
|
-
"html-to-markdown>=1.
|
52
|
+
"html-to-markdown[lxml]>=1.6.0",
|
53
53
|
"msgspec>=0.18.0",
|
54
54
|
"playa-pdf>=0.6.1", # pinned due to breaking changes in 0.5.0
|
55
55
|
"psutil>=7.0.0",
|
@@ -100,7 +100,7 @@ async def test_extract_bytes_html(html_document: Path) -> None:
|
|
100
100
|
assert_extraction_result(result, mime_type=MARKDOWN_MIME_TYPE)
|
101
101
|
assert (
|
102
102
|
result.content
|
103
|
-
==
|
103
|
+
== 'Browsers usually insert quotation marks around the q element. WWF\'s goal is to: "Build a future where people live in harmony with nature."'
|
104
104
|
)
|
105
105
|
|
106
106
|
|
@@ -169,7 +169,7 @@ async def test_extract_file_html(html_document: Path) -> None:
|
|
169
169
|
assert_extraction_result(result, mime_type=MARKDOWN_MIME_TYPE)
|
170
170
|
assert (
|
171
171
|
result.content
|
172
|
-
==
|
172
|
+
== 'Browsers usually insert quotation marks around the q element. WWF\'s goal is to: "Build a future where people live in harmony with nature."'
|
173
173
|
)
|
174
174
|
|
175
175
|
|
@@ -24,7 +24,7 @@ wheels = [
|
|
24
24
|
|
25
25
|
[[package]]
|
26
26
|
name = "aiohttp"
|
27
|
-
version = "3.12.
|
27
|
+
version = "3.12.14"
|
28
28
|
source = { registry = "https://pypi.org/simple" }
|
29
29
|
dependencies = [
|
30
30
|
{ name = "aiohappyeyeballs" },
|
@@ -35,25 +35,25 @@ dependencies = [
|
|
35
35
|
{ name = "propcache" },
|
36
36
|
{ name = "yarl" },
|
37
37
|
]
|
38
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
38
|
+
sdist = { url = "https://files.pythonhosted.org/packages/e6/0b/e39ad954107ebf213a2325038a3e7a506be3d98e1435e1f82086eec4cde2/aiohttp-3.12.14.tar.gz", hash = "sha256:6e06e120e34d93100de448fd941522e11dafa78ef1a893c179901b7d66aa29f2", size = 7822921, upload-time = "2025-07-10T13:05:33.968Z" }
|
39
39
|
wheels = [
|
40
|
-
{ url = "https://files.pythonhosted.org/packages/
|
41
|
-
{ url = "https://files.pythonhosted.org/packages/
|
42
|
-
{ url = "https://files.pythonhosted.org/packages/
|
43
|
-
{ url = "https://files.pythonhosted.org/packages/
|
44
|
-
{ url = "https://files.pythonhosted.org/packages/
|
45
|
-
{ url = "https://files.pythonhosted.org/packages/
|
46
|
-
{ url = "https://files.pythonhosted.org/packages/
|
47
|
-
{ url = "https://files.pythonhosted.org/packages/
|
48
|
-
{ url = "https://files.pythonhosted.org/packages/
|
49
|
-
{ url = "https://files.pythonhosted.org/packages/
|
50
|
-
{ url = "https://files.pythonhosted.org/packages/
|
51
|
-
{ url = "https://files.pythonhosted.org/packages/
|
52
|
-
{ url = "https://files.pythonhosted.org/packages/
|
53
|
-
{ url = "https://files.pythonhosted.org/packages/
|
54
|
-
{ url = "https://files.pythonhosted.org/packages/
|
55
|
-
{ url = "https://files.pythonhosted.org/packages/
|
56
|
-
{ url = "https://files.pythonhosted.org/packages/
|
40
|
+
{ url = "https://files.pythonhosted.org/packages/06/48/e0d2fa8ac778008071e7b79b93ab31ef14ab88804d7ba71b5c964a7c844e/aiohttp-3.12.14-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3143a7893d94dc82bc409f7308bc10d60285a3cd831a68faf1aa0836c5c3c767", size = 695471, upload-time = "2025-07-10T13:04:20.124Z" },
|
41
|
+
{ url = "https://files.pythonhosted.org/packages/8d/e7/f73206afa33100804f790b71092888f47df65fd9a4cd0e6800d7c6826441/aiohttp-3.12.14-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3d62ac3d506cef54b355bd34c2a7c230eb693880001dfcda0bf88b38f5d7af7e", size = 473128, upload-time = "2025-07-10T13:04:21.928Z" },
|
42
|
+
{ url = "https://files.pythonhosted.org/packages/df/e2/4dd00180be551a6e7ee979c20fc7c32727f4889ee3fd5b0586e0d47f30e1/aiohttp-3.12.14-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:48e43e075c6a438937c4de48ec30fa8ad8e6dfef122a038847456bfe7b947b63", size = 465426, upload-time = "2025-07-10T13:04:24.071Z" },
|
43
|
+
{ url = "https://files.pythonhosted.org/packages/de/dd/525ed198a0bb674a323e93e4d928443a680860802c44fa7922d39436b48b/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077b4488411a9724cecc436cbc8c133e0d61e694995b8de51aaf351c7578949d", size = 1704252, upload-time = "2025-07-10T13:04:26.049Z" },
|
44
|
+
{ url = "https://files.pythonhosted.org/packages/d8/b1/01e542aed560a968f692ab4fc4323286e8bc4daae83348cd63588e4f33e3/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d8c35632575653f297dcbc9546305b2c1133391089ab925a6a3706dfa775ccab", size = 1685514, upload-time = "2025-07-10T13:04:28.186Z" },
|
45
|
+
{ url = "https://files.pythonhosted.org/packages/b3/06/93669694dc5fdabdc01338791e70452d60ce21ea0946a878715688d5a191/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b8ce87963f0035c6834b28f061df90cf525ff7c9b6283a8ac23acee6502afd4", size = 1737586, upload-time = "2025-07-10T13:04:30.195Z" },
|
46
|
+
{ url = "https://files.pythonhosted.org/packages/a5/3a/18991048ffc1407ca51efb49ba8bcc1645961f97f563a6c480cdf0286310/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a2cf66e32a2563bb0766eb24eae7e9a269ac0dc48db0aae90b575dc9583026", size = 1786958, upload-time = "2025-07-10T13:04:32.482Z" },
|
47
|
+
{ url = "https://files.pythonhosted.org/packages/30/a8/81e237f89a32029f9b4a805af6dffc378f8459c7b9942712c809ff9e76e5/aiohttp-3.12.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdea089caf6d5cde975084a884c72d901e36ef9c2fd972c9f51efbbc64e96fbd", size = 1709287, upload-time = "2025-07-10T13:04:34.493Z" },
|
48
|
+
{ url = "https://files.pythonhosted.org/packages/8c/e3/bd67a11b0fe7fc12c6030473afd9e44223d456f500f7cf526dbaa259ae46/aiohttp-3.12.14-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7865f27db67d49e81d463da64a59365ebd6b826e0e4847aa111056dcb9dc88", size = 1622990, upload-time = "2025-07-10T13:04:36.433Z" },
|
49
|
+
{ url = "https://files.pythonhosted.org/packages/83/ba/e0cc8e0f0d9ce0904e3cf2d6fa41904e379e718a013c721b781d53dcbcca/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0ab5b38a6a39781d77713ad930cb5e7feea6f253de656a5f9f281a8f5931b086", size = 1676015, upload-time = "2025-07-10T13:04:38.958Z" },
|
50
|
+
{ url = "https://files.pythonhosted.org/packages/d8/b3/1e6c960520bda094c48b56de29a3d978254637ace7168dd97ddc273d0d6c/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b3b15acee5c17e8848d90a4ebc27853f37077ba6aec4d8cb4dbbea56d156933", size = 1707678, upload-time = "2025-07-10T13:04:41.275Z" },
|
51
|
+
{ url = "https://files.pythonhosted.org/packages/0a/19/929a3eb8c35b7f9f076a462eaa9830b32c7f27d3395397665caa5e975614/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4c972b0bdaac167c1e53e16a16101b17c6d0ed7eac178e653a07b9f7fad7151", size = 1650274, upload-time = "2025-07-10T13:04:43.483Z" },
|
52
|
+
{ url = "https://files.pythonhosted.org/packages/22/e5/81682a6f20dd1b18ce3d747de8eba11cbef9b270f567426ff7880b096b48/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7442488b0039257a3bdbc55f7209587911f143fca11df9869578db6c26feeeb8", size = 1726408, upload-time = "2025-07-10T13:04:45.577Z" },
|
53
|
+
{ url = "https://files.pythonhosted.org/packages/8c/17/884938dffaa4048302985483f77dfce5ac18339aad9b04ad4aaa5e32b028/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f68d3067eecb64c5e9bab4a26aa11bd676f4c70eea9ef6536b0a4e490639add3", size = 1759879, upload-time = "2025-07-10T13:04:47.663Z" },
|
54
|
+
{ url = "https://files.pythonhosted.org/packages/95/78/53b081980f50b5cf874359bde707a6eacd6c4be3f5f5c93937e48c9d0025/aiohttp-3.12.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f88d3704c8b3d598a08ad17d06006cb1ca52a1182291f04979e305c8be6c9758", size = 1708770, upload-time = "2025-07-10T13:04:49.944Z" },
|
55
|
+
{ url = "https://files.pythonhosted.org/packages/ed/91/228eeddb008ecbe3ffa6c77b440597fdf640307162f0c6488e72c5a2d112/aiohttp-3.12.14-cp313-cp313-win32.whl", hash = "sha256:a3c99ab19c7bf375c4ae3debd91ca5d394b98b6089a03231d4c580ef3c2ae4c5", size = 421688, upload-time = "2025-07-10T13:04:51.993Z" },
|
56
|
+
{ url = "https://files.pythonhosted.org/packages/66/5f/8427618903343402fdafe2850738f735fd1d9409d2a8f9bcaae5e630d3ba/aiohttp-3.12.14-cp313-cp313-win_amd64.whl", hash = "sha256:3f8aad695e12edc9d571f878c62bedc91adf30c760c8632f09663e5f564f4baa", size = 448098, upload-time = "2025-07-10T13:04:53.999Z" },
|
57
57
|
]
|
58
58
|
|
59
59
|
[[package]]
|
@@ -92,11 +92,11 @@ wheels = [
|
|
92
92
|
|
93
93
|
[[package]]
|
94
94
|
name = "asgiref"
|
95
|
-
version = "3.9.
|
95
|
+
version = "3.9.1"
|
96
96
|
source = { registry = "https://pypi.org/simple" }
|
97
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
97
|
+
sdist = { url = "https://files.pythonhosted.org/packages/90/61/0aa957eec22ff70b830b22ff91f825e70e1ef732c06666a805730f28b36b/asgiref-3.9.1.tar.gz", hash = "sha256:a5ab6582236218e5ef1648f242fd9f10626cfd4de8dc377db215d5d5098e3142", size = 36870, upload-time = "2025-07-08T09:07:43.344Z" }
|
98
98
|
wheels = [
|
99
|
-
{ url = "https://files.pythonhosted.org/packages/
|
99
|
+
{ url = "https://files.pythonhosted.org/packages/7c/3c/0464dcada90d5da0e71018c04a140ad6349558afb30b3051b4264cc5b965/asgiref-3.9.1-py3-none-any.whl", hash = "sha256:f3bba7092a48005b5f5bacd747d36ee4a5a61f4a269a6df590b43144355ebd2c", size = 23790, upload-time = "2025-07-08T09:07:41.548Z" },
|
100
100
|
]
|
101
101
|
|
102
102
|
[[package]]
|
@@ -212,11 +212,11 @@ wheels = [
|
|
212
212
|
|
213
213
|
[[package]]
|
214
214
|
name = "certifi"
|
215
|
-
version = "2025.
|
215
|
+
version = "2025.7.9"
|
216
216
|
source = { registry = "https://pypi.org/simple" }
|
217
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
217
|
+
sdist = { url = "https://files.pythonhosted.org/packages/de/8a/c729b6b60c66a38f590c4e774decc4b2ec7b0576be8f1aa984a53ffa812a/certifi-2025.7.9.tar.gz", hash = "sha256:c1d2ec05395148ee10cf672ffc28cd37ea0ab0d99f9cc74c43e588cbd111b079", size = 160386, upload-time = "2025-07-09T02:13:58.874Z" }
|
218
218
|
wheels = [
|
219
|
-
{ url = "https://files.pythonhosted.org/packages/
|
219
|
+
{ url = "https://files.pythonhosted.org/packages/66/f3/80a3f974c8b535d394ff960a11ac20368e06b736da395b551a49ce950cce/certifi-2025.7.9-py3-none-any.whl", hash = "sha256:d842783a14f8fdd646895ac26f719a061408834473cfc10203f6a575beb15d39", size = 159230, upload-time = "2025-07-09T02:13:57.007Z" },
|
220
220
|
]
|
221
221
|
|
222
222
|
[[package]]
|
@@ -866,14 +866,19 @@ wheels = [
|
|
866
866
|
|
867
867
|
[[package]]
|
868
868
|
name = "html-to-markdown"
|
869
|
-
version = "1.
|
869
|
+
version = "1.6.0"
|
870
870
|
source = { registry = "https://pypi.org/simple" }
|
871
871
|
dependencies = [
|
872
872
|
{ name = "beautifulsoup4" },
|
873
873
|
]
|
874
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
874
|
+
sdist = { url = "https://files.pythonhosted.org/packages/6b/33/041a35156641991d6805af0847e82467c69a6071738e982379ff9a55151b/html_to_markdown-1.6.0.tar.gz", hash = "sha256:b2916f9d78f4faac866935688ff3f05f7e0a873554f9d797b8bca0e32afdc921", size = 35099, upload-time = "2025-07-11T06:14:12.227Z" }
|
875
875
|
wheels = [
|
876
|
-
{ url = "https://files.pythonhosted.org/packages/
|
876
|
+
{ url = "https://files.pythonhosted.org/packages/c9/23/38e07928c22dd1facd00781758bc4efb5f94c0240aa59e22b9a290fe44a1/html_to_markdown-1.6.0-py3-none-any.whl", hash = "sha256:88848e851ea80719397fc5356018c367d415bbd1ace030f3ad380fafd49fa0ed", size = 31616, upload-time = "2025-07-11T06:14:10.544Z" },
|
877
|
+
]
|
878
|
+
|
879
|
+
[package.optional-dependencies]
|
880
|
+
lxml = [
|
881
|
+
{ name = "lxml" },
|
877
882
|
]
|
878
883
|
|
879
884
|
[[package]]
|
@@ -1166,12 +1171,12 @@ wheels = [
|
|
1166
1171
|
|
1167
1172
|
[[package]]
|
1168
1173
|
name = "kreuzberg"
|
1169
|
-
version = "3.6.
|
1174
|
+
version = "3.6.2"
|
1170
1175
|
source = { editable = "." }
|
1171
1176
|
dependencies = [
|
1172
1177
|
{ name = "anyio" },
|
1173
1178
|
{ name = "charset-normalizer" },
|
1174
|
-
{ name = "html-to-markdown" },
|
1179
|
+
{ name = "html-to-markdown", extra = ["lxml"] },
|
1175
1180
|
{ name = "msgspec" },
|
1176
1181
|
{ name = "playa-pdf" },
|
1177
1182
|
{ name = "psutil" },
|
@@ -1254,7 +1259,7 @@ requires-dist = [
|
|
1254
1259
|
{ name = "exceptiongroup", marker = "python_full_version < '3.11'", specifier = ">=1.2.2" },
|
1255
1260
|
{ name = "fast-langdetect", marker = "extra == 'langdetect'", specifier = ">=0.3.2" },
|
1256
1261
|
{ name = "gmft", marker = "extra == 'gmft'", specifier = ">=0.4.2" },
|
1257
|
-
{ name = "html-to-markdown", specifier = ">=1.
|
1262
|
+
{ name = "html-to-markdown", extras = ["lxml"], specifier = ">=1.6.0" },
|
1258
1263
|
{ name = "keybert", marker = "extra == 'entity-extraction'", specifier = ">=0.9.0" },
|
1259
1264
|
{ name = "kreuzberg", extras = ["api", "chunking", "cli", "easyocr", "entity-extraction", "gmft", "langdetect", "paddleocr"], marker = "extra == 'all'" },
|
1260
1265
|
{ name = "litestar", extras = ["standard", "structlog", "opentelemetry"], marker = "extra == 'api'", specifier = ">=2.16.0" },
|
@@ -1422,7 +1427,7 @@ wheels = [
|
|
1422
1427
|
|
1423
1428
|
[[package]]
|
1424
1429
|
name = "langsmith"
|
1425
|
-
version = "0.4.
|
1430
|
+
version = "0.4.5"
|
1426
1431
|
source = { registry = "https://pypi.org/simple" }
|
1427
1432
|
dependencies = [
|
1428
1433
|
{ name = "httpx" },
|
@@ -1433,9 +1438,9 @@ dependencies = [
|
|
1433
1438
|
{ name = "requests-toolbelt" },
|
1434
1439
|
{ name = "zstandard" },
|
1435
1440
|
]
|
1436
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
1441
|
+
sdist = { url = "https://files.pythonhosted.org/packages/5c/92/7885823f3d13222f57773921f0da19b37d628c64607491233dc853a0f6ea/langsmith-0.4.5.tar.gz", hash = "sha256:49444bd8ccd4e46402f1b9ff1d686fa8e3a31b175e7085e72175ab8ec6164a34", size = 352235, upload-time = "2025-07-10T22:08:04.505Z" }
|
1437
1442
|
wheels = [
|
1438
|
-
{ url = "https://files.pythonhosted.org/packages/
|
1443
|
+
{ url = "https://files.pythonhosted.org/packages/c8/10/ad3107b666c3203b7938d10ea6b8746b9735c399cf737a51386d58e41d34/langsmith-0.4.5-py3-none-any.whl", hash = "sha256:4167717a2cccc4dff5809dbddc439628e836f6fd13d4fdb31ea013bc8d5cfaf5", size = 367795, upload-time = "2025-07-10T22:08:02.548Z" },
|
1439
1444
|
]
|
1440
1445
|
|
1441
1446
|
[[package]]
|
@@ -2170,7 +2175,7 @@ wheels = [
|
|
2170
2175
|
|
2171
2176
|
[[package]]
|
2172
2177
|
name = "openai"
|
2173
|
-
version = "1.
|
2178
|
+
version = "1.95.0"
|
2174
2179
|
source = { registry = "https://pypi.org/simple" }
|
2175
2180
|
dependencies = [
|
2176
2181
|
{ name = "anyio" },
|
@@ -2182,9 +2187,9 @@ dependencies = [
|
|
2182
2187
|
{ name = "tqdm" },
|
2183
2188
|
{ name = "typing-extensions" },
|
2184
2189
|
]
|
2185
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
2190
|
+
sdist = { url = "https://files.pythonhosted.org/packages/ef/2f/0c6f509a1585545962bfa6e201d7fb658eb2a6f52fb8c26765632d91706c/openai-1.95.0.tar.gz", hash = "sha256:54bc42df9f7142312647dd485d34cca5df20af825fa64a30ca55164be2cf4cc9", size = 488144, upload-time = "2025-07-10T18:35:49.946Z" }
|
2186
2191
|
wheels = [
|
2187
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2192
|
+
{ url = "https://files.pythonhosted.org/packages/19/a5/57d0bb58b938a3e3f352ff26e645da1660436402a6ad1b29780d261cc5a5/openai-1.95.0-py3-none-any.whl", hash = "sha256:a7afc9dca7e7d616371842af8ea6dbfbcb739a85d183f5f664ab1cc311b9ef18", size = 755572, upload-time = "2025-07-10T18:35:47.507Z" },
|
2188
2193
|
]
|
2189
2194
|
|
2190
2195
|
[[package]]
|
@@ -2392,7 +2397,7 @@ wheels = [
|
|
2392
2397
|
|
2393
2398
|
[[package]]
|
2394
2399
|
name = "paddlex"
|
2395
|
-
version = "3.1.
|
2400
|
+
version = "3.1.2"
|
2396
2401
|
source = { registry = "https://pypi.org/simple" }
|
2397
2402
|
dependencies = [
|
2398
2403
|
{ name = "chardet" },
|
@@ -2413,7 +2418,7 @@ dependencies = [
|
|
2413
2418
|
{ name = "ujson" },
|
2414
2419
|
]
|
2415
2420
|
wheels = [
|
2416
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2421
|
+
{ url = "https://files.pythonhosted.org/packages/5a/71/f824642aa730a45c6cd195c2df3d0d6376f170023f66aadc1b9f1f856ac7/paddlex-3.1.2-py3-none-any.whl", hash = "sha256:a87ede09cf6a9aebc09deadfaaedc6377505b408cac2ca4ec372c90e7399c71b", size = 1688520, upload-time = "2025-07-08T08:47:20.639Z" },
|
2417
2422
|
]
|
2418
2423
|
|
2419
2424
|
[package.optional-dependencies]
|
@@ -2489,7 +2494,7 @@ wheels = [
|
|
2489
2494
|
|
2490
2495
|
[[package]]
|
2491
2496
|
name = "pandas"
|
2492
|
-
version = "2.3.
|
2497
|
+
version = "2.3.1"
|
2493
2498
|
source = { registry = "https://pypi.org/simple" }
|
2494
2499
|
dependencies = [
|
2495
2500
|
{ name = "numpy" },
|
@@ -2497,21 +2502,21 @@ dependencies = [
|
|
2497
2502
|
{ name = "pytz" },
|
2498
2503
|
{ name = "tzdata" },
|
2499
2504
|
]
|
2500
|
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
2501
|
-
wheels = [
|
2502
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2503
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2504
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2505
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2506
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2507
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2508
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2509
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2510
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2511
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2512
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2513
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2514
|
-
{ url = "https://files.pythonhosted.org/packages/
|
2505
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = "sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" }
|
2506
|
+
wheels = [
|
2507
|
+
{ url = "https://files.pythonhosted.org/packages/32/ed/ff0a67a2c5505e1854e6715586ac6693dd860fbf52ef9f81edee200266e7/pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22", size = 11531393, upload-time = "2025-07-07T19:19:12.245Z" },
|
2508
|
+
{ url = "https://files.pythonhosted.org/packages/c7/db/d8f24a7cc9fb0972adab0cc80b6817e8bef888cfd0024eeb5a21c0bb5c4a/pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a", size = 10668750, upload-time = "2025-07-07T19:19:14.612Z" },
|
2509
|
+
{ url = "https://files.pythonhosted.org/packages/0f/b0/80f6ec783313f1e2356b28b4fd8d2148c378370045da918c73145e6aab50/pandas-2.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928", size = 11342004, upload-time = "2025-07-07T19:19:16.857Z" },
|
2510
|
+
{ url = "https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9", size = 12050869, upload-time = "2025-07-07T19:19:19.265Z" },
|
2511
|
+
{ url = "https://files.pythonhosted.org/packages/55/79/20d746b0a96c67203a5bee5fb4e00ac49c3e8009a39e1f78de264ecc5729/pandas-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12", size = 12750218, upload-time = "2025-07-07T19:19:21.547Z" },
|
2512
|
+
{ url = "https://files.pythonhosted.org/packages/7c/0f/145c8b41e48dbf03dd18fdd7f24f8ba95b8254a97a3379048378f33e7838/pandas-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb", size = 13416763, upload-time = "2025-07-07T19:19:23.939Z" },
|
2513
|
+
{ url = "https://files.pythonhosted.org/packages/b2/c0/54415af59db5cdd86a3d3bf79863e8cc3fa9ed265f0745254061ac09d5f2/pandas-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956", size = 10987482, upload-time = "2025-07-07T19:19:42.699Z" },
|
2514
|
+
{ url = "https://files.pythonhosted.org/packages/48/64/2fd2e400073a1230e13b8cd604c9bc95d9e3b962e5d44088ead2e8f0cfec/pandas-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a", size = 12029159, upload-time = "2025-07-07T19:19:26.362Z" },
|
2515
|
+
{ url = "https://files.pythonhosted.org/packages/d8/0a/d84fd79b0293b7ef88c760d7dca69828d867c89b6d9bc52d6a27e4d87316/pandas-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9", size = 11393287, upload-time = "2025-07-07T19:19:29.157Z" },
|
2516
|
+
{ url = "https://files.pythonhosted.org/packages/50/ae/ff885d2b6e88f3c7520bb74ba319268b42f05d7e583b5dded9837da2723f/pandas-2.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275", size = 11309381, upload-time = "2025-07-07T19:19:31.436Z" },
|
2517
|
+
{ url = "https://files.pythonhosted.org/packages/85/86/1fa345fc17caf5d7780d2699985c03dbe186c68fee00b526813939062bb0/pandas-2.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab", size = 11883998, upload-time = "2025-07-07T19:19:34.267Z" },
|
2518
|
+
{ url = "https://files.pythonhosted.org/packages/81/aa/e58541a49b5e6310d89474333e994ee57fea97c8aaa8fc7f00b873059bbf/pandas-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96", size = 12704705, upload-time = "2025-07-07T19:19:36.856Z" },
|
2519
|
+
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
2515
2520
|
]
|
2516
2521
|
|
2517
2522
|
[[package]]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|