kreuzberg 3.11.3__tar.gz → 3.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.docker/Dockerfile +27 -2
- kreuzberg-3.13.0/.docker/README.md +190 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/ci.yaml +0 -6
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/docker-e2e-tests.yml +4 -8
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/docs.yml +1 -1
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/publish-docker.yml +76 -27
- kreuzberg-3.13.0/.github/workflows/test-docker-builds.yml +97 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.gitignore +27 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.pre-commit-config.yaml +3 -3
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/PKG-INFO +17 -14
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/README.md +13 -9
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/Taskfile.yml +17 -16
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/ai-rulez.yaml +0 -37
- kreuzberg-3.13.0/benchmarks/README.md +234 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/benchmarks/pyproject.toml +5 -2
- kreuzberg-3.13.0/benchmarks/src/__init__.py +1 -0
- kreuzberg-3.13.0/benchmarks/src/__main__.py +4 -0
- {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/benchmarks.py +0 -9
- {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/cli.py +316 -15
- {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/models.py +2 -22
- {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/profiler.py +0 -14
- {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/runner.py +2 -26
- kreuzberg-3.13.0/docker-compose.example.yml +26 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/types.md +13 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/cli.md +36 -1
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/examples/extraction-examples.md +103 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/installation.md +8 -38
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/index.md +1 -1
- kreuzberg-3.13.0/docs/user-guide/api-server.md +313 -0
- kreuzberg-3.13.0/docs/user-guide/docker.md +548 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/document-classification.md +1 -1
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/extraction-configuration.md +162 -27
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/ocr-backends.md +17 -13
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/ocr-configuration.md +125 -3
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/__init__.py +14 -13
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/__main__.py +0 -2
- kreuzberg-3.13.0/kreuzberg/_api/main.py +218 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_config.py +248 -204
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_document_classification.py +0 -8
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_entity_extraction.py +1 -93
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_base.py +0 -5
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_email.py +1 -11
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_html.py +9 -12
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_image.py +1 -23
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_pandoc.py +10 -89
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_pdf.py +39 -92
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_presentation.py +0 -17
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_spread_sheet.py +13 -53
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_structured.py +1 -4
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_gmft.py +14 -138
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_language_detection.py +1 -22
- kreuzberg-3.13.0/kreuzberg/_mcp/__init__.py +3 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_mcp/server.py +3 -10
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_mime_types.py +1 -2
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_easyocr.py +21 -108
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg-3.13.0/kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg-3.13.0/kreuzberg/_ocr/_tesseract.py +1629 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_playa.py +5 -4
- kreuzberg-3.13.0/kreuzberg/_types.py +1011 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_cache.py +88 -90
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_device.py +0 -18
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_document_cache.py +0 -2
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_errors.py +0 -3
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_pdf_lock.py +0 -2
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_process_pool.py +19 -19
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_quality.py +0 -43
- kreuzberg-3.13.0/kreuzberg/_utils/_ref.py +48 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_serialization.py +0 -5
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_string.py +9 -39
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_sync.py +0 -1
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_table.py +50 -57
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/cli.py +55 -77
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/mkdocs.yaml +1 -1
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/pyproject.toml +11 -13
- kreuzberg-3.13.0/results/baseline.json +9 -0
- kreuzberg-3.13.0/results/serialization.json +11 -0
- kreuzberg-3.13.0/results/statistical.json +21 -0
- kreuzberg-3.13.0/tests/api/conftest.py +17 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/api/main_test.py +140 -88
- kreuzberg-3.13.0/tests/api/runtime_config_test.py +322 -0
- kreuzberg-3.13.0/tests/cli_command_test.py +481 -0
- kreuzberg-3.13.0/tests/cli_integration_test.py +858 -0
- kreuzberg-3.13.0/tests/cli_test.py +324 -0
- kreuzberg-3.13.0/tests/config_test.py +1540 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/conftest.py +0 -8
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/document_classification_test.py +12 -96
- kreuzberg-3.11.3/tests/e2e/docker_images_test.py → kreuzberg-3.13.0/tests/e2e/docker_e2e_test.py +11 -130
- kreuzberg-3.13.0/tests/entity_extraction_test.py +589 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/exceptions_test.py +0 -10
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extraction_batch_test.py +3 -28
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extraction_test.py +9 -91
- kreuzberg-3.13.0/tests/extractors/email_test.py +924 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extractors/html_test.py +0 -2
- kreuzberg-3.13.0/tests/extractors/image_test.py +693 -0
- kreuzberg-3.13.0/tests/extractors/pandoc_test.py +1996 -0
- kreuzberg-3.13.0/tests/extractors/pdf_test.py +900 -0
- kreuzberg-3.13.0/tests/extractors/presentation_test.py +934 -0
- kreuzberg-3.13.0/tests/extractors/spreed_sheet_test.py +1121 -0
- kreuzberg-3.13.0/tests/extractors/structured_test.py +304 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/gmft_extended_test.py +2 -17
- kreuzberg-3.13.0/tests/gmft_test.py +785 -0
- kreuzberg-3.13.0/tests/html_to_markdown_config_test.py +217 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/language_detection_test.py +3 -24
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/mcp_server_test.py +19 -145
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/mime_types_test.py +0 -4
- kreuzberg-3.13.0/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/multiprocessing/gmft_integration_test.py +2 -9
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/multiprocessing/gmft_isolated_test.py +4 -50
- kreuzberg-3.13.0/tests/multiprocessing/process_manager_test.py +273 -0
- kreuzberg-3.13.0/tests/multiprocessing/tesseract_pool_test.py +331 -0
- kreuzberg-3.13.0/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/base_test.py +0 -13
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/device_integration_test.py +0 -3
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/easyocr_test.py +0 -15
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/init_test.py +0 -6
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/paddleocr_test.py +0 -10
- kreuzberg-3.13.0/tests/ocr/tesseract_test.py +1154 -0
- kreuzberg-3.13.0/tests/ocr/tesseract_tsv_integration_test.py +273 -0
- kreuzberg-3.13.0/tests/ocr/tesseract_tsv_test.py +382 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/playa_helpers_test.py +8 -84
- kreuzberg-3.13.0/tests/tesseract_sync_formats_test.py +168 -0
- kreuzberg-3.13.0/tests/test_source_files/contract.txt +1 -0
- kreuzberg-3.13.0/tests/test_source_files/tables/borderless_table.png +0 -0
- kreuzberg-3.13.0/tests/test_source_files/tables/complex_document.png +0 -0
- kreuzberg-3.13.0/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/types_test.py +10 -76
- kreuzberg-3.13.0/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/cache_test.py +26 -69
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/device_test.py +0 -2
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/errors_test.py +129 -94
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/pdf_lock_test.py +0 -18
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/process_pool_test.py +9 -29
- kreuzberg-3.13.0/tests/utils/ref_test.py +90 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/serialization_test.py +1 -40
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/string_test.py +12 -66
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/sync_test.py +0 -43
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/table_test.py +18 -78
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/uv.lock +633 -615
- kreuzberg-3.11.3/.docker/README.md +0 -87
- kreuzberg-3.11.3/.task/checksum/docker-build-core +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-easyocr +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-gmft +0 -1
- kreuzberg-3.11.3/.task/checksum/docker-build-paddle +0 -1
- kreuzberg-3.11.3/benchmarks/README.md +0 -152
- kreuzberg-3.11.3/benchmarks/benchmark_baseline.py +0 -116
- kreuzberg-3.11.3/benchmarks/end_to_end_benchmark.py +0 -238
- kreuzberg-3.11.3/benchmarks/final_benchmark.py +0 -147
- kreuzberg-3.11.3/benchmarks/results/baseline_results.json +0 -35
- kreuzberg-3.11.3/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -50
- kreuzberg-3.11.3/benchmarks/results/comprehensive_caching_results.json +0 -55
- kreuzberg-3.11.3/benchmarks/results/final_benchmark_results.json +0 -12
- kreuzberg-3.11.3/benchmarks/results/latest.json +0 -607
- kreuzberg-3.11.3/benchmarks/results/mime_caching_results.json +0 -18
- kreuzberg-3.11.3/benchmarks/results/msgspec_caching_results.json +0 -10
- kreuzberg-3.11.3/benchmarks/results/ocr_caching_results.json +0 -17
- kreuzberg-3.11.3/benchmarks/results/serialization_benchmark_results.json +0 -42
- kreuzberg-3.11.3/benchmarks/results/statistical_benchmark_results.json +0 -26
- kreuzberg-3.11.3/benchmarks/results/table_caching_results.json +0 -17
- kreuzberg-3.11.3/benchmarks/serialization_benchmark.py +0 -165
- kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -3
- kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -6
- kreuzberg-3.11.3/benchmarks/statistical_benchmark.py +0 -219
- kreuzberg-3.11.3/docs/performance-analysis.md +0 -168
- kreuzberg-3.11.3/docs/user-guide/api-server.md +0 -169
- kreuzberg-3.11.3/docs/user-guide/docker.md +0 -389
- kreuzberg-3.11.3/kreuzberg/_api/main.py +0 -108
- kreuzberg-3.11.3/kreuzberg/_mcp/__init__.py +0 -5
- kreuzberg-3.11.3/kreuzberg/_ocr/_tesseract.py +0 -987
- kreuzberg-3.11.3/kreuzberg/_types.py +0 -413
- kreuzberg-3.11.3/tests/cli_command_test.py +0 -523
- kreuzberg-3.11.3/tests/cli_integration_test.py +0 -531
- kreuzberg-3.11.3/tests/cli_test.py +0 -335
- kreuzberg-3.11.3/tests/config_test.py +0 -1570
- kreuzberg-3.11.3/tests/e2e/run_docker_tests.sh +0 -371
- kreuzberg-3.11.3/tests/e2e/test_report.json +0 -14
- kreuzberg-3.11.3/tests/entity_extraction_test.py +0 -675
- kreuzberg-3.11.3/tests/extractors/email_test.py +0 -1003
- kreuzberg-3.11.3/tests/extractors/image_test.py +0 -768
- kreuzberg-3.11.3/tests/extractors/pandoc_test.py +0 -2123
- kreuzberg-3.11.3/tests/extractors/pdf_test.py +0 -973
- kreuzberg-3.11.3/tests/extractors/presentation_test.py +0 -1005
- kreuzberg-3.11.3/tests/extractors/spreed_sheet_test.py +0 -1237
- kreuzberg-3.11.3/tests/extractors/structured_test.py +0 -302
- kreuzberg-3.11.3/tests/gmft_test.py +0 -839
- kreuzberg-3.11.3/tests/multiprocessing/__init__.py +0 -1
- kreuzberg-3.11.3/tests/multiprocessing/process_manager_test.py +0 -282
- kreuzberg-3.11.3/tests/multiprocessing/tesseract_pool_test.py +0 -349
- kreuzberg-3.11.3/tests/ocr/tesseract_test.py +0 -1141
- kreuzberg-3.11.3/tests/utils_errors_test.py +0 -299
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.commitlintrc +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.deepsource.toml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.dockerignore +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/LICENSE +0 -0
- {kreuzberg-3.11.3/kreuzberg/_api → kreuzberg-3.13.0/benchmarks}/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/contributing.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.3/kreuzberg/_extractors → kreuzberg-3.13.0/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.3/kreuzberg/_utils → kreuzberg-3.13.0/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.3/tests → kreuzberg-3.13.0/kreuzberg/_utils}/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/py.typed +0 -0
- /kreuzberg-3.11.3/tests/api/__init__.py → /kreuzberg-3.13.0/output.txt +0 -0
- {kreuzberg-3.11.3/tests/extractors → kreuzberg-3.13.0/tests}/__init__.py +0 -0
- {kreuzberg-3.11.3/tests/ocr → kreuzberg-3.13.0/tests/api}/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.3/tests/utils → kreuzberg-3.13.0/tests/e2e}/__init__.py +0 -0
- /kreuzberg-3.11.3/tests/test_source_files/contract.txt → /kreuzberg-3.13.0/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/excel.xlsx +0 -0
- /kreuzberg-3.11.3/tests/test_source_files/better-ocr-image.jpg → /kreuzberg-3.13.0/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/tmp_test.py +0 -0
@@ -13,7 +13,15 @@ COPY kreuzberg kreuzberg
|
|
13
13
|
|
14
14
|
# Install dependencies with optimizations
|
15
15
|
RUN --mount=type=cache,target=/tmp/uv-cache \
|
16
|
-
|
16
|
+
if [ -z "$EXTRAS" ]; then \
|
17
|
+
uv sync --extra api --no-editable --no-dev --compile-bytecode; \
|
18
|
+
else \
|
19
|
+
extras_args="--extra api"; \
|
20
|
+
for extra in $EXTRAS; do \
|
21
|
+
extras_args="$extras_args --extra $extra"; \
|
22
|
+
done; \
|
23
|
+
uv sync $extras_args --no-editable --no-dev --compile-bytecode; \
|
24
|
+
fi && \
|
17
25
|
rm -rf /app/.venv/lib/python*/site-packages/**/__pycache__ && \
|
18
26
|
find /app/.venv -type f -name "*.pyc" -delete && \
|
19
27
|
find /app/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
|
@@ -28,11 +36,24 @@ ENV PYTHONUNBUFFERED=1
|
|
28
36
|
ENV PATH="/app/.venv/bin:$PATH"
|
29
37
|
|
30
38
|
# Install runtime dependencies
|
39
|
+
# Languages included: English (default), Spanish, French, German, Italian, Portuguese,
|
40
|
+
# Chinese (simplified & traditional), Japanese, Arabic, Russian, Hindi ~keep
|
31
41
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
32
42
|
pandoc \
|
33
43
|
tesseract-ocr \
|
34
44
|
tesseract-ocr-eng \
|
35
45
|
tesseract-ocr-osd \
|
46
|
+
tesseract-ocr-spa \
|
47
|
+
tesseract-ocr-fra \
|
48
|
+
tesseract-ocr-deu \
|
49
|
+
tesseract-ocr-ita \
|
50
|
+
tesseract-ocr-por \
|
51
|
+
tesseract-ocr-chi-sim \
|
52
|
+
tesseract-ocr-chi-tra \
|
53
|
+
tesseract-ocr-jpn \
|
54
|
+
tesseract-ocr-ara \
|
55
|
+
tesseract-ocr-rus \
|
56
|
+
tesseract-ocr-hin \
|
36
57
|
libglib2.0-0 \
|
37
58
|
libsm6 \
|
38
59
|
libxext6 \
|
@@ -46,9 +67,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
46
67
|
COPY --from=builder /app/.venv /app/.venv
|
47
68
|
COPY --from=builder /app/kreuzberg /app/kreuzberg
|
48
69
|
|
49
|
-
# Create non-root user
|
70
|
+
# Create non-root user and cache directory
|
50
71
|
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser && \
|
72
|
+
mkdir -p /app/.kreuzberg && \
|
51
73
|
chown -R appuser:appuser /app
|
52
74
|
|
75
|
+
# Set default cache directory to prevent permission issues
|
76
|
+
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
77
|
+
|
53
78
|
USER appuser
|
54
79
|
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -0,0 +1,190 @@
|
|
1
|
+
# Kreuzberg Docker Images
|
2
|
+
|
3
|
+
[GitHub](https://github.com/Goldziher/kreuzberg)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://kreuzberg.dev/)
|
6
|
+
[License: MIT](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
|
7
|
+
|
8
|
+
High-performance Python library for text extraction from documents, available as optimized Docker images.
|
9
|
+
|
10
|
+
**Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
|
11
|
+
|
12
|
+
## Quick Start
|
13
|
+
|
14
|
+
```bash
|
15
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
16
|
+
```
|
17
|
+
|
18
|
+
## Available Images
|
19
|
+
|
20
|
+
### Base Image (`latest`)
|
21
|
+
|
22
|
+
- **Image**: `goldziher/kreuzberg:latest`
|
23
|
+
- **Size**: ~550MB compressed
|
24
|
+
- **Includes**: REST API server, CLI tools, Tesseract OCR with 12 business languages
|
25
|
+
- **Languages**: English, Spanish, French, German, Italian, Portuguese, Chinese (Simplified & Traditional), Japanese, Arabic, Russian, Hindi
|
26
|
+
- **Use cases**: Basic document processing, simple API deployments, cost-conscious workflows
|
27
|
+
|
28
|
+
### Core Image (`core`)
|
29
|
+
|
30
|
+
- **Image**: `goldziher/kreuzberg-core:latest`
|
31
|
+
- **Size**: ~700MB compressed
|
32
|
+
- **Includes**: Everything from base plus:
|
33
|
+
- Text chunking (semantic-text-splitter)
|
34
|
+
- Encrypted PDF support (crypto)
|
35
|
+
- Document classification
|
36
|
+
- Language detection
|
37
|
+
- Email parsing (.eml, .msg)
|
38
|
+
- Additional format extensions
|
39
|
+
- **Use cases**: RAG applications, document intelligence, enterprise workflows, multi-language processing
|
40
|
+
|
41
|
+
## Usage
|
42
|
+
|
43
|
+
### Extract Files via API
|
44
|
+
|
45
|
+
```bash
|
46
|
+
# Single file with base image
|
47
|
+
curl -X POST http://localhost:8000/extract \
|
48
|
+
-F "data=@document.pdf"
|
49
|
+
|
50
|
+
# With core image - chunking for RAG
|
51
|
+
docker run -p 8000:8000 goldziher/kreuzberg-core:latest
|
52
|
+
curl -X POST http://localhost:8000/extract \
|
53
|
+
-F "data=@document.pdf" \
|
54
|
+
-F "chunk_content=true" \
|
55
|
+
-F "max_chars=1000"
|
56
|
+
|
57
|
+
# Language detection
|
58
|
+
curl -X POST http://localhost:8000/extract \
|
59
|
+
-F "data=@document.pdf" \
|
60
|
+
-F "auto_detect_language=true"
|
61
|
+
|
62
|
+
# Encrypted PDF
|
63
|
+
curl -X POST http://localhost:8000/extract \
|
64
|
+
-F "data=@encrypted.pdf" \
|
65
|
+
-F "password=secretpassword"
|
66
|
+
```
|
67
|
+
|
68
|
+
### Docker Compose
|
69
|
+
|
70
|
+
```yaml
|
71
|
+
version: '3.8'
|
72
|
+
|
73
|
+
services:
|
74
|
+
kreuzberg:
|
75
|
+
image: goldziher/kreuzberg-core:latest
|
76
|
+
ports:
|
77
|
+
- "8000:8000"
|
78
|
+
volumes:
|
79
|
+
- kreuzberg-cache:/app/.kreuzberg
|
80
|
+
environment:
|
81
|
+
- PYTHONUNBUFFERED=1
|
82
|
+
- KREUZBERG_CACHE_DIR=/app/.kreuzberg
|
83
|
+
restart: unless-stopped
|
84
|
+
healthcheck:
|
85
|
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
86
|
+
interval: 30s
|
87
|
+
timeout: 10s
|
88
|
+
retries: 3
|
89
|
+
|
90
|
+
volumes:
|
91
|
+
kreuzberg-cache:
|
92
|
+
```
|
93
|
+
|
94
|
+
## Custom Images
|
95
|
+
|
96
|
+
Create tailored images for your specific needs:
|
97
|
+
|
98
|
+
### Example: RAG-Optimized
|
99
|
+
|
100
|
+
```dockerfile
|
101
|
+
FROM goldziher/kreuzberg:latest
|
102
|
+
|
103
|
+
USER root
|
104
|
+
|
105
|
+
# Add chunking and language detection
|
106
|
+
RUN pip install --upgrade "kreuzberg[chunking,langdetect]"
|
107
|
+
|
108
|
+
USER appuser
|
109
|
+
|
110
|
+
# RAG-optimized defaults
|
111
|
+
ENV KREUZBERG_CHUNK_CONTENT=true
|
112
|
+
ENV KREUZBERG_MAX_CHARS=1000
|
113
|
+
ENV KREUZBERG_AUTO_DETECT_LANGUAGE=true
|
114
|
+
```
|
115
|
+
|
116
|
+
### Example: Crypto Support
|
117
|
+
|
118
|
+
```dockerfile
|
119
|
+
FROM goldziher/kreuzberg:latest
|
120
|
+
|
121
|
+
USER root
|
122
|
+
|
123
|
+
# Add encrypted PDF support
|
124
|
+
RUN pip install --upgrade "kreuzberg[crypto]"
|
125
|
+
|
126
|
+
USER appuser
|
127
|
+
```
|
128
|
+
|
129
|
+
## Configuration
|
130
|
+
|
131
|
+
### Environment Variables
|
132
|
+
|
133
|
+
- `KREUZBERG_CACHE_DIR`: Cache directory (default: `/app/.kreuzberg`)
|
134
|
+
- `KREUZBERG_CHUNK_CONTENT`: Enable chunking (`true`/`false`)
|
135
|
+
- `KREUZBERG_AUTO_DETECT_LANGUAGE`: Enable language detection (`true`/`false`)
|
136
|
+
- `KREUZBERG_OCR_BACKEND`: OCR backend (`tesseract` or `none`)
|
137
|
+
|
138
|
+
### Configuration File
|
139
|
+
|
140
|
+
Mount `kreuzberg.toml`:
|
141
|
+
|
142
|
+
```toml
|
143
|
+
chunk_content = true
|
144
|
+
auto_detect_language = true
|
145
|
+
max_chars = 1000
|
146
|
+
ocr_backend = "tesseract"
|
147
|
+
|
148
|
+
[tesseract]
|
149
|
+
language = "eng+spa+fra+deu"
|
150
|
+
psm = 6
|
151
|
+
```
|
152
|
+
|
153
|
+
```bash
|
154
|
+
docker run -p 8000:8000 \
|
155
|
+
-v "$(pwd)/kreuzberg.toml:/app/kreuzberg.toml:ro" \
|
156
|
+
goldziher/kreuzberg-core:latest
|
157
|
+
```
|
158
|
+
|
159
|
+
## Features
|
160
|
+
|
161
|
+
- **🚀 High Performance**: Optimized for speed and efficiency
|
162
|
+
- **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
|
163
|
+
- **🔍 OCR Support**: Built-in Tesseract with 12 business languages
|
164
|
+
- **🔒 Secure**: Runs as non-root user, no external API calls
|
165
|
+
- **📦 Ready to Use**: Pre-configured API server
|
166
|
+
|
167
|
+
## Documentation
|
168
|
+
|
169
|
+
- **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
|
170
|
+
- **[Full Documentation](https://kreuzberg.dev/)** - Complete user guide and API reference
|
171
|
+
- **[API Documentation](https://kreuzberg.dev/user-guide/api-server/)** - REST API endpoints and usage
|
172
|
+
- **[Docker Guide](https://kreuzberg.dev/user-guide/docker/)** - Detailed Docker usage guide
|
173
|
+
|
174
|
+
## Support
|
175
|
+
|
176
|
+
- **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
|
177
|
+
- **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
|
178
|
+
- **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
|
179
|
+
|
180
|
+
## Contributing
|
181
|
+
|
182
|
+
Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
|
183
|
+
|
184
|
+
## License
|
185
|
+
|
186
|
+
MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
|
187
|
+
|
188
|
+
______________________________________________________________________
|
189
|
+
|
190
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
@@ -51,7 +51,6 @@ jobs:
|
|
51
51
|
- name: Execute Pre-Commit
|
52
52
|
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
53
53
|
|
54
|
-
# Coverage job runs first, only on Python 3.13 Ubuntu
|
55
54
|
coverage:
|
56
55
|
needs: validate
|
57
56
|
runs-on: ubuntu-latest
|
@@ -135,7 +134,6 @@ jobs:
|
|
135
134
|
.coverage
|
136
135
|
retention-days: 7
|
137
136
|
|
138
|
-
# Full test matrix runs only after coverage succeeds
|
139
137
|
test:
|
140
138
|
needs: coverage
|
141
139
|
runs-on: ${{ matrix.os }}
|
@@ -144,10 +142,6 @@ jobs:
|
|
144
142
|
matrix:
|
145
143
|
os: [ubuntu-latest, windows-latest, macos-latest]
|
146
144
|
python: ["3.10", "3.11", "3.12", "3.13"]
|
147
|
-
exclude:
|
148
|
-
# Skip Python 3.13 on macOS for now due to compatibility issues
|
149
|
-
- os: macos-latest
|
150
|
-
python: "3.13"
|
151
145
|
timeout-minutes: 30
|
152
146
|
steps:
|
153
147
|
- name: Checkout
|
@@ -11,10 +11,8 @@ jobs:
|
|
11
11
|
strategy:
|
12
12
|
matrix:
|
13
13
|
image:
|
14
|
-
- { name: "core", extras: "" }
|
15
|
-
- { name: "easyocr", extras: "easyocr" }
|
16
|
-
- { name: "paddle", extras: "paddleocr" }
|
17
|
-
- { name: "gmft", extras: "gmft" }
|
14
|
+
- { name: "base", extras: "cli" }
|
15
|
+
- { name: "core", extras: "cli chunking crypto document-classification langdetect additional-extensions" }
|
18
16
|
fail-fast: false
|
19
17
|
|
20
18
|
steps:
|
@@ -92,9 +90,7 @@ jobs:
|
|
92
90
|
docker build -f .docker/Dockerfile \
|
93
91
|
--build-arg EXTRAS="${{ matrix.image.extras }}" \
|
94
92
|
-t kreuzberg:${{ matrix.image.name }} \
|
95
|
-
--cache-from type=gha \
|
96
|
-
--cache-to type=gha,mode=max \
|
97
|
-
--load \
|
93
|
+
--no-cache \
|
98
94
|
.
|
99
95
|
|
100
96
|
echo "Built image:"
|
@@ -104,7 +100,7 @@ jobs:
|
|
104
100
|
run: |
|
105
101
|
mkdir -p tests/e2e/logs
|
106
102
|
echo "Running E2E tests for ${{ matrix.image.name }}..."
|
107
|
-
python3 tests/e2e/
|
103
|
+
python3 tests/e2e/docker_e2e_test.py --image ${{ matrix.image.name }}
|
108
104
|
|
109
105
|
- name: Generate test report - ${{ matrix.image.name }}
|
110
106
|
if: always()
|
@@ -2,19 +2,32 @@ name: Publish Docker Images
|
|
2
2
|
|
3
3
|
on:
|
4
4
|
workflow_dispatch:
|
5
|
+
inputs:
|
6
|
+
version:
|
7
|
+
description: 'Version to build (leave empty to use latest git tag)'
|
8
|
+
required: false
|
9
|
+
type: string
|
10
|
+
build_base:
|
11
|
+
description: 'Build base image'
|
12
|
+
required: true
|
13
|
+
type: boolean
|
14
|
+
default: true
|
15
|
+
build_core:
|
16
|
+
description: 'Build core image'
|
17
|
+
required: true
|
18
|
+
type: boolean
|
19
|
+
default: true
|
5
20
|
release:
|
6
21
|
types: [published]
|
7
22
|
|
8
23
|
jobs:
|
9
|
-
# Run E2E tests first
|
10
24
|
test-images:
|
11
25
|
uses: ./.github/workflows/docker-e2e-tests.yml
|
12
26
|
|
13
|
-
# Build and publish images after tests pass
|
14
27
|
build-and-publish:
|
15
28
|
needs: test-images
|
16
|
-
runs-on: ubuntu-latest
|
17
29
|
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
|
30
|
+
runs-on: ubuntu-latest
|
18
31
|
permissions:
|
19
32
|
contents: read
|
20
33
|
packages: write
|
@@ -22,34 +35,49 @@ jobs:
|
|
22
35
|
version: ${{ steps.get_version.outputs.VERSION }}
|
23
36
|
|
24
37
|
strategy:
|
25
|
-
max-parallel:
|
38
|
+
max-parallel: 1 # Build one at a time to save disk space ~keep
|
26
39
|
matrix:
|
27
40
|
include:
|
41
|
+
- name: base
|
42
|
+
extras: "cli"
|
43
|
+
image_name: "goldziher/kreuzberg"
|
44
|
+
should_build: ${{ github.event_name == 'release' || inputs.build_base == true }}
|
28
45
|
- name: core
|
29
|
-
extras: ""
|
30
|
-
|
31
|
-
|
32
|
-
extras: "easyocr"
|
33
|
-
tag_suffix: "-easyocr"
|
34
|
-
- name: paddle
|
35
|
-
extras: "paddleocr"
|
36
|
-
tag_suffix: "-paddle"
|
37
|
-
- name: gmft
|
38
|
-
extras: "gmft"
|
39
|
-
tag_suffix: "-gmft"
|
40
|
-
- name: all
|
41
|
-
extras: "all"
|
42
|
-
tag_suffix: "-all"
|
46
|
+
extras: "cli chunking crypto document-classification langdetect additional-extensions"
|
47
|
+
image_name: "goldziher/kreuzberg-core"
|
48
|
+
should_build: ${{ github.event_name == 'release' || inputs.build_core == true }}
|
43
49
|
|
44
50
|
steps:
|
45
51
|
- name: Free up disk space
|
46
52
|
run: |
|
53
|
+
echo "Initial disk space:"
|
54
|
+
df -h /
|
55
|
+
|
56
|
+
# Remove unnecessary large directories (saves ~30GB) ~keep
|
47
57
|
sudo rm -rf /usr/share/dotnet
|
48
58
|
sudo rm -rf /usr/local/lib/android
|
49
59
|
sudo rm -rf /opt/ghc
|
50
60
|
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
51
|
-
sudo
|
52
|
-
|
61
|
+
sudo rm -rf /usr/local/share/boost
|
62
|
+
sudo rm -rf /usr/local/lib/node_modules
|
63
|
+
sudo rm -rf /opt/microsoft
|
64
|
+
sudo rm -rf /usr/local/.ghcup
|
65
|
+
sudo rm -rf /opt/hostedtoolcache
|
66
|
+
|
67
|
+
# Clean apt
|
68
|
+
sudo apt-get clean
|
69
|
+
sudo rm -rf /var/lib/apt/lists/*
|
70
|
+
|
71
|
+
# Remove swap to free up space
|
72
|
+
sudo swapoff -a
|
73
|
+
sudo rm -f /swapfile
|
74
|
+
|
75
|
+
# Clean Docker completely
|
76
|
+
docker system prune -af --volumes || true
|
77
|
+
docker builder prune -af || true
|
78
|
+
|
79
|
+
echo "Available disk space after cleanup:"
|
80
|
+
df -h /
|
53
81
|
|
54
82
|
- name: Checkout repository
|
55
83
|
uses: actions/checkout@v5
|
@@ -61,6 +89,8 @@ jobs:
|
|
61
89
|
run: |
|
62
90
|
if [ "${{ github.event_name }}" = "release" ]; then
|
63
91
|
VERSION="${{ github.event.release.tag_name }}"
|
92
|
+
elif [ -n "${{ inputs.version }}" ]; then
|
93
|
+
VERSION="${{ inputs.version }}"
|
64
94
|
else
|
65
95
|
git fetch --tags
|
66
96
|
VERSION=$(git tag --sort=-version:refname | head -n1)
|
@@ -81,15 +111,17 @@ jobs:
|
|
81
111
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
82
112
|
|
83
113
|
- name: Extract metadata (tags, labels) for Docker
|
114
|
+
if: ${{ matrix.should_build }}
|
84
115
|
id: meta
|
85
116
|
uses: docker/metadata-action@v5
|
86
117
|
with:
|
87
|
-
images:
|
118
|
+
images: ${{ matrix.image_name }}
|
88
119
|
tags: |
|
89
|
-
type=raw,value=${{ steps.get_version.outputs.VERSION }}
|
90
|
-
type=raw,value=latest
|
120
|
+
type=raw,value=${{ steps.get_version.outputs.VERSION }}
|
121
|
+
type=raw,value=latest
|
91
122
|
|
92
123
|
- name: Build and push Docker image to Docker Hub
|
124
|
+
if: ${{ matrix.should_build }}
|
93
125
|
uses: docker/build-push-action@v6
|
94
126
|
with:
|
95
127
|
context: .
|
@@ -100,15 +132,32 @@ jobs:
|
|
100
132
|
EXTRAS=${{ matrix.extras }}
|
101
133
|
tags: ${{ steps.meta.outputs.tags }}
|
102
134
|
labels: ${{ steps.meta.outputs.labels }}
|
103
|
-
cache
|
104
|
-
|
135
|
+
no-cache: true
|
136
|
+
|
137
|
+
- name: Clean up after build
|
138
|
+
if: always()
|
139
|
+
run: |
|
140
|
+
# Remove all Docker images and containers
|
141
|
+
docker stop $(docker ps -aq) || true
|
142
|
+
docker rm $(docker ps -aq) || true
|
143
|
+
docker rmi $(docker images -q) || true
|
144
|
+
|
145
|
+
# Clean all Docker data
|
146
|
+
docker system prune -af --volumes || true
|
147
|
+
docker builder prune -af || true
|
148
|
+
|
149
|
+
# Clear buildkit cache
|
150
|
+
docker buildx prune -af || true
|
151
|
+
|
152
|
+
echo "Disk space after cleanup:"
|
153
|
+
df -h /
|
105
154
|
|
106
155
|
- name: Update Docker Hub README
|
156
|
+
if: ${{ matrix.should_build }}
|
107
157
|
uses: peter-evans/dockerhub-description@v4
|
108
|
-
if: matrix.name == 'core'
|
109
158
|
continue-on-error: true
|
110
159
|
with:
|
111
160
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
112
161
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
113
|
-
repository:
|
162
|
+
repository: ${{ matrix.image_name }}
|
114
163
|
readme-filepath: ./.docker/README.md
|
@@ -0,0 +1,97 @@
|
|
1
|
+
name: Test Docker Builds (No Push)
|
2
|
+
|
3
|
+
on:
|
4
|
+
workflow_dispatch:
|
5
|
+
|
6
|
+
jobs:
|
7
|
+
test-build-all-images:
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
strategy:
|
10
|
+
max-parallel: 1 # Build one at a time to save disk space ~keep
|
11
|
+
matrix:
|
12
|
+
include:
|
13
|
+
- name: base
|
14
|
+
extras: "cli"
|
15
|
+
image_name: "goldziher/kreuzberg"
|
16
|
+
- name: core
|
17
|
+
extras: "cli,chunking,crypto,document-classification,langdetect,additional-extensions"
|
18
|
+
image_name: "goldziher/kreuzberg-core"
|
19
|
+
|
20
|
+
steps:
|
21
|
+
- name: Free up disk space
|
22
|
+
run: |
|
23
|
+
echo "Initial disk space:"
|
24
|
+
df -h /
|
25
|
+
|
26
|
+
# Remove unnecessary large directories (saves ~30GB)
|
27
|
+
sudo rm -rf /usr/share/dotnet
|
28
|
+
sudo rm -rf /usr/local/lib/android
|
29
|
+
sudo rm -rf /opt/ghc
|
30
|
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
31
|
+
sudo rm -rf /usr/local/share/boost
|
32
|
+
sudo rm -rf /usr/local/lib/node_modules
|
33
|
+
sudo rm -rf /opt/microsoft
|
34
|
+
sudo rm -rf /usr/local/.ghcup
|
35
|
+
sudo rm -rf /opt/hostedtoolcache
|
36
|
+
|
37
|
+
# Clean apt
|
38
|
+
sudo apt-get clean
|
39
|
+
sudo rm -rf /var/lib/apt/lists/*
|
40
|
+
|
41
|
+
# Remove swap to free up space
|
42
|
+
sudo swapoff -a
|
43
|
+
sudo rm -f /swapfile
|
44
|
+
|
45
|
+
# Clean Docker completely
|
46
|
+
docker system prune -af --volumes || true
|
47
|
+
docker builder prune -af || true
|
48
|
+
|
49
|
+
echo "Available disk space after cleanup:"
|
50
|
+
df -h /
|
51
|
+
|
52
|
+
- name: Checkout repository
|
53
|
+
uses: actions/checkout@v5
|
54
|
+
|
55
|
+
- name: Set up Docker Buildx
|
56
|
+
uses: docker/setup-buildx-action@v3
|
57
|
+
|
58
|
+
- name: Build Docker image - ${{ matrix.name }}
|
59
|
+
run: |
|
60
|
+
echo "===================="
|
61
|
+
echo "Building ${{ matrix.name }} image"
|
62
|
+
echo "===================="
|
63
|
+
|
64
|
+
docker buildx build \
|
65
|
+
--file ./.docker/Dockerfile \
|
66
|
+
--build-arg EXTRAS="${{ matrix.extras }}" \
|
67
|
+
--tag ${{ matrix.image_name }}:test \
|
68
|
+
--platform linux/amd64 \
|
69
|
+
--no-cache \
|
70
|
+
--load \
|
71
|
+
.
|
72
|
+
|
73
|
+
echo "Image built successfully:"
|
74
|
+
docker images | grep "${{ matrix.image_name }}:test" || true
|
75
|
+
|
76
|
+
echo "Current disk usage:"
|
77
|
+
df -h /
|
78
|
+
|
79
|
+
- name: Clean up after build
|
80
|
+
if: always()
|
81
|
+
run: |
|
82
|
+
echo "Cleaning up Docker resources..."
|
83
|
+
|
84
|
+
# Remove all Docker images and containers
|
85
|
+
docker stop $(docker ps -aq) || true
|
86
|
+
docker rm $(docker ps -aq) || true
|
87
|
+
docker rmi $(docker images -q) || true
|
88
|
+
|
89
|
+
# Clean all Docker data
|
90
|
+
docker system prune -af --volumes || true
|
91
|
+
docker builder prune -af || true
|
92
|
+
|
93
|
+
# Clear buildkit cache
|
94
|
+
docker buildx prune -af || true
|
95
|
+
|
96
|
+
echo "Disk space after cleanup:"
|
97
|
+
df -h /
|
@@ -39,3 +39,30 @@ requirements.txt
|
|
39
39
|
site/
|
40
40
|
.cache/
|
41
41
|
dist/
|
42
|
+
build/
|
43
|
+
.task/
|
44
|
+
tests/e2e/test_report.json
|
45
|
+
tests/e2e/logs/
|
46
|
+
|
47
|
+
# Additional build artifacts
|
48
|
+
*.whl
|
49
|
+
*.tar.gz
|
50
|
+
.tox/
|
51
|
+
.nox/
|
52
|
+
wheels/
|
53
|
+
share/python-wheels/
|
54
|
+
|
55
|
+
# Documentation builds
|
56
|
+
docs/_build/
|
57
|
+
docs/build/
|
58
|
+
|
59
|
+
# Node.js (if any frontend tools are used)
|
60
|
+
node_modules/
|
61
|
+
npm-debug.log*
|
62
|
+
yarn-debug.log*
|
63
|
+
yarn-error.log*
|
64
|
+
|
65
|
+
# Temporary files
|
66
|
+
*.tmp
|
67
|
+
*.temp
|
68
|
+
.tmp/
|
@@ -6,7 +6,7 @@ repos:
|
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
8
|
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
-
rev: v1.6.
|
9
|
+
rev: v1.6.1
|
10
10
|
hooks:
|
11
11
|
- id: ai-rulez-validate
|
12
12
|
- id: ai-rulez-generate
|
@@ -53,7 +53,7 @@ repos:
|
|
53
53
|
hooks:
|
54
54
|
- id: pyproject-fmt
|
55
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.11
|
57
57
|
hooks:
|
58
58
|
- id: ruff
|
59
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -66,7 +66,7 @@ repos:
|
|
66
66
|
additional_dependencies:
|
67
67
|
- tomli
|
68
68
|
- repo: https://github.com/jsh9/pydoclint
|
69
|
-
rev: 0.
|
69
|
+
rev: 0.7.3
|
70
70
|
hooks:
|
71
71
|
- id: pydoclint
|
72
72
|
args:
|