kreuzberg 3.11.1__tar.gz → 3.11.3__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- kreuzberg-3.11.3/.docker/Dockerfile +54 -0
- kreuzberg-3.11.3/.github/workflows/docker-e2e-tests.yml +151 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/workflows/publish-docker.yml +10 -8
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.gitignore +1 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.pre-commit-config.yaml +7 -9
- kreuzberg-3.11.3/.task/checksum/docker-build-core +1 -0
- kreuzberg-3.11.3/.task/checksum/docker-build-easyocr +1 -0
- kreuzberg-3.11.3/.task/checksum/docker-build-gmft +1 -0
- kreuzberg-3.11.3/.task/checksum/docker-build-paddle +1 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/PKG-INFO +8 -8
- kreuzberg-3.11.3/Taskfile.yml +160 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/ai-rulez.yaml +327 -11
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/getting-started/installation.md +25 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/docker.md +1 -1
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/ocr-backends.md +5 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_gmft.py +28 -10
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/pyproject.toml +15 -6
- kreuzberg-3.11.3/tests/e2e/docker_images_test.py +600 -0
- kreuzberg-3.11.3/tests/e2e/run_docker_tests.sh +371 -0
- kreuzberg-3.11.3/tests/e2e/test_report.json +14 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/gmft_test.py +119 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/uv.lock +604 -1583
- kreuzberg-3.11.1/.docker/Dockerfile +0 -21
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.commitlintrc +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.deepsource.toml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.docker/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.dockerignore +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/.markdownlint.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/LICENSE +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/README.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/latest.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/advanced/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/assets/logo.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/cli.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/contributing.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/css/extra.css +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/examples/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/performance-analysis.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/mkdocs.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/api/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/api/main_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/chunker_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/cli_command_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/cli_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/config_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/conftest.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/document_classification_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/entity_extraction_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extraction_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/hooks_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/mcp_server_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/playa_helpers_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/playa_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/registry_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/better-ocr-image.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/types_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils/tmp_test.py +0 -0
- {kreuzberg-3.11.1 → kreuzberg-3.11.3}/tests/utils_errors_test.py +0 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
# Build stage
|
2
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS builder
|
3
|
+
ARG EXTRAS=""
|
4
|
+
WORKDIR /app
|
5
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
6
|
+
ENV PYTHONUNBUFFERED=1
|
7
|
+
ENV UV_LINK_MODE=copy
|
8
|
+
ENV UV_CACHE_DIR=/tmp/uv-cache
|
9
|
+
|
10
|
+
# Copy dependency files
|
11
|
+
COPY pyproject.toml uv.lock README.md ./
|
12
|
+
COPY kreuzberg kreuzberg
|
13
|
+
|
14
|
+
# Install dependencies with optimizations
|
15
|
+
RUN --mount=type=cache,target=/tmp/uv-cache \
|
16
|
+
uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode && \
|
17
|
+
rm -rf /app/.venv/lib/python*/site-packages/**/__pycache__ && \
|
18
|
+
find /app/.venv -type f -name "*.pyc" -delete && \
|
19
|
+
find /app/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
|
20
|
+
find /app/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true
|
21
|
+
|
22
|
+
# Runtime stage
|
23
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim AS runtime
|
24
|
+
ARG EXTRAS=""
|
25
|
+
WORKDIR /app
|
26
|
+
ENV PYTHONDONTWRITEBYTECODE=1
|
27
|
+
ENV PYTHONUNBUFFERED=1
|
28
|
+
ENV PATH="/app/.venv/bin:$PATH"
|
29
|
+
|
30
|
+
# Install runtime dependencies
|
31
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
32
|
+
pandoc \
|
33
|
+
tesseract-ocr \
|
34
|
+
tesseract-ocr-eng \
|
35
|
+
tesseract-ocr-osd \
|
36
|
+
libglib2.0-0 \
|
37
|
+
libsm6 \
|
38
|
+
libxext6 \
|
39
|
+
libxrender-dev \
|
40
|
+
libgomp1 \
|
41
|
+
libgl1 \
|
42
|
+
libglib2.0-0 \
|
43
|
+
&& apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
44
|
+
|
45
|
+
# Copy virtual environment from builder
|
46
|
+
COPY --from=builder /app/.venv /app/.venv
|
47
|
+
COPY --from=builder /app/kreuzberg /app/kreuzberg
|
48
|
+
|
49
|
+
# Create non-root user
|
50
|
+
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser && \
|
51
|
+
chown -R appuser:appuser /app
|
52
|
+
|
53
|
+
USER appuser
|
54
|
+
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -0,0 +1,151 @@
|
|
1
|
+
name: Docker E2E Tests
|
2
|
+
|
3
|
+
on:
|
4
|
+
workflow_dispatch:
|
5
|
+
workflow_call:
|
6
|
+
|
7
|
+
jobs:
|
8
|
+
test-docker-images:
|
9
|
+
runs-on: ubuntu-latest
|
10
|
+
timeout-minutes: 60
|
11
|
+
strategy:
|
12
|
+
matrix:
|
13
|
+
image:
|
14
|
+
- { name: "core", extras: "" }
|
15
|
+
- { name: "easyocr", extras: "easyocr" }
|
16
|
+
- { name: "paddle", extras: "paddleocr" }
|
17
|
+
- { name: "gmft", extras: "gmft" }
|
18
|
+
fail-fast: false
|
19
|
+
|
20
|
+
steps:
|
21
|
+
- name: Checkout repository
|
22
|
+
uses: actions/checkout@v5
|
23
|
+
|
24
|
+
- name: Install uv
|
25
|
+
uses: astral-sh/setup-uv@v6
|
26
|
+
with:
|
27
|
+
enable-cache: true
|
28
|
+
|
29
|
+
- name: Set up Python
|
30
|
+
uses: actions/setup-python@v5
|
31
|
+
with:
|
32
|
+
python-version-file: "pyproject.toml"
|
33
|
+
|
34
|
+
- name: Install test dependencies
|
35
|
+
run: |
|
36
|
+
uv pip install --system asyncio
|
37
|
+
|
38
|
+
- name: Install system dependencies
|
39
|
+
run: |
|
40
|
+
sudo apt-get update
|
41
|
+
sudo apt-get install -y pandoc tesseract-ocr jq bc
|
42
|
+
|
43
|
+
- name: Free up disk space
|
44
|
+
run: |
|
45
|
+
echo "Initial disk usage:"
|
46
|
+
df -h
|
47
|
+
|
48
|
+
sudo rm -rf /usr/share/dotnet
|
49
|
+
sudo rm -rf /usr/local/lib/android
|
50
|
+
sudo rm -rf /opt/ghc
|
51
|
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
52
|
+
sudo rm -rf /usr/local/share/boost
|
53
|
+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
54
|
+
|
55
|
+
sudo apt-get clean
|
56
|
+
sudo apt-get autoremove -y
|
57
|
+
|
58
|
+
docker system prune -af --volumes
|
59
|
+
|
60
|
+
sudo swapoff -a
|
61
|
+
sudo rm -f /swapfile
|
62
|
+
|
63
|
+
echo "Available disk space after cleanup:"
|
64
|
+
df -h
|
65
|
+
|
66
|
+
- name: Set up Docker Buildx
|
67
|
+
uses: docker/setup-buildx-action@v3
|
68
|
+
with:
|
69
|
+
driver-opts: |
|
70
|
+
image=moby/buildkit:latest
|
71
|
+
network=host
|
72
|
+
|
73
|
+
- name: Configure Docker
|
74
|
+
run: |
|
75
|
+
sudo tee /etc/docker/daemon.json > /dev/null <<EOF
|
76
|
+
{
|
77
|
+
"max-concurrent-downloads": 10,
|
78
|
+
"max-concurrent-uploads": 10,
|
79
|
+
"storage-driver": "overlay2"
|
80
|
+
}
|
81
|
+
EOF
|
82
|
+
|
83
|
+
sudo systemctl restart docker
|
84
|
+
docker info
|
85
|
+
|
86
|
+
- name: Build Docker image - ${{ matrix.image.name }}
|
87
|
+
run: |
|
88
|
+
export DOCKER_BUILDKIT=1
|
89
|
+
export BUILDKIT_PROGRESS=plain
|
90
|
+
|
91
|
+
echo "Building ${{ matrix.image.name }} image..."
|
92
|
+
docker build -f .docker/Dockerfile \
|
93
|
+
--build-arg EXTRAS="${{ matrix.image.extras }}" \
|
94
|
+
-t kreuzberg:${{ matrix.image.name }} \
|
95
|
+
--cache-from type=gha \
|
96
|
+
--cache-to type=gha,mode=max \
|
97
|
+
--load \
|
98
|
+
.
|
99
|
+
|
100
|
+
echo "Built image:"
|
101
|
+
docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep kreuzberg:${{ matrix.image.name }} || true
|
102
|
+
|
103
|
+
- name: Run E2E tests - ${{ matrix.image.name }}
|
104
|
+
run: |
|
105
|
+
mkdir -p tests/e2e/logs
|
106
|
+
echo "Running E2E tests for ${{ matrix.image.name }}..."
|
107
|
+
python3 tests/e2e/docker_images_test.py --image ${{ matrix.image.name }}
|
108
|
+
|
109
|
+
- name: Generate test report - ${{ matrix.image.name }}
|
110
|
+
if: always()
|
111
|
+
run: |
|
112
|
+
if [ -f "tests/e2e/test_report.json" ]; then
|
113
|
+
echo "## Test Report Summary for ${{ matrix.image.name }}" >> $GITHUB_STEP_SUMMARY
|
114
|
+
echo "\`\`\`json" >> $GITHUB_STEP_SUMMARY
|
115
|
+
jq '.["${{ matrix.image.name }}"]' tests/e2e/test_report.json >> $GITHUB_STEP_SUMMARY || echo "No results for ${{ matrix.image.name }}" >> $GITHUB_STEP_SUMMARY
|
116
|
+
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
117
|
+
fi
|
118
|
+
|
119
|
+
- name: Collect Docker logs on failure
|
120
|
+
if: failure()
|
121
|
+
run: |
|
122
|
+
mkdir -p tests/e2e/docker-logs
|
123
|
+
|
124
|
+
for container in $(docker ps -a --filter "name=kreuzberg-test" --format "{{.Names}}"); do
|
125
|
+
docker logs "$container" > "tests/e2e/docker-logs/${container}.log" 2>&1 || true
|
126
|
+
done
|
127
|
+
|
128
|
+
docker info > tests/e2e/docker-logs/docker-info.txt 2>&1 || true
|
129
|
+
docker version > tests/e2e/docker-logs/docker-version.txt 2>&1 || true
|
130
|
+
|
131
|
+
- name: Upload test results - ${{ matrix.image.name }}
|
132
|
+
if: always()
|
133
|
+
uses: actions/upload-artifact@v4
|
134
|
+
with:
|
135
|
+
name: docker-e2e-test-results-${{ matrix.image.name }}
|
136
|
+
path: |
|
137
|
+
tests/e2e/logs/
|
138
|
+
tests/e2e/*.log
|
139
|
+
tests/e2e/*.json
|
140
|
+
tests/e2e/docker-logs/
|
141
|
+
retention-days: 7
|
142
|
+
|
143
|
+
- name: Clean up Docker resources
|
144
|
+
if: always()
|
145
|
+
run: |
|
146
|
+
docker ps -aq --filter "name=kreuzberg-test" | xargs -r docker rm -f || true
|
147
|
+
docker rmi kreuzberg:${{ matrix.image.name }} || true
|
148
|
+
docker system prune -af --volumes || true
|
149
|
+
|
150
|
+
echo "Final disk usage after ${{ matrix.image.name }}:"
|
151
|
+
df -h
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
name: Publish Docker Images
|
3
2
|
|
4
3
|
on:
|
@@ -7,12 +6,20 @@ on:
|
|
7
6
|
types: [published]
|
8
7
|
|
9
8
|
jobs:
|
10
|
-
|
9
|
+
# Run E2E tests first
|
10
|
+
test-images:
|
11
|
+
uses: ./.github/workflows/docker-e2e-tests.yml
|
12
|
+
|
13
|
+
# Build and publish images after tests pass
|
14
|
+
build-and-publish:
|
15
|
+
needs: test-images
|
11
16
|
runs-on: ubuntu-latest
|
12
17
|
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
|
13
18
|
permissions:
|
14
19
|
contents: read
|
15
20
|
packages: write
|
21
|
+
outputs:
|
22
|
+
version: ${{ steps.get_version.outputs.VERSION }}
|
16
23
|
|
17
24
|
strategy:
|
18
25
|
max-parallel: 2
|
@@ -37,7 +44,6 @@ jobs:
|
|
37
44
|
steps:
|
38
45
|
- name: Free up disk space
|
39
46
|
run: |
|
40
|
-
# Remove large unnecessary packages to free up space
|
41
47
|
sudo rm -rf /usr/share/dotnet
|
42
48
|
sudo rm -rf /usr/local/lib/android
|
43
49
|
sudo rm -rf /opt/ghc
|
@@ -54,10 +60,8 @@ jobs:
|
|
54
60
|
id: get_version
|
55
61
|
run: |
|
56
62
|
if [ "${{ github.event_name }}" = "release" ]; then
|
57
|
-
# For release events, use the release tag
|
58
63
|
VERSION="${{ github.event.release.tag_name }}"
|
59
64
|
else
|
60
|
-
# For workflow_dispatch, get the latest tag
|
61
65
|
git fetch --tags
|
62
66
|
VERSION=$(git tag --sort=-version:refname | head -n1)
|
63
67
|
fi
|
@@ -82,12 +86,10 @@ jobs:
|
|
82
86
|
with:
|
83
87
|
images: goldziher/kreuzberg
|
84
88
|
tags: |
|
85
|
-
# Release version tag (e.g., v3.0.0-easyocr)
|
86
89
|
type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
|
87
|
-
# Latest tag for each variant (e.g., latest-easyocr)
|
88
90
|
type=raw,value=latest${{ matrix.tag_suffix }}
|
89
91
|
|
90
|
-
- name: Build and push Docker image
|
92
|
+
- name: Build and push Docker image to Docker Hub
|
91
93
|
uses: docker/build-push-action@v6
|
92
94
|
with:
|
93
95
|
context: .
|
@@ -5,13 +5,11 @@ repos:
|
|
5
5
|
- id: commitlint
|
6
6
|
stages: [commit-msg]
|
7
7
|
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
# - id: ai-rulez-validate
|
14
|
-
# - id: ai-rulez-generate
|
8
|
+
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
+
rev: v1.6.0
|
10
|
+
hooks:
|
11
|
+
- id: ai-rulez-validate
|
12
|
+
- id: ai-rulez-generate
|
15
13
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
16
14
|
rev: v6.0.0
|
17
15
|
hooks:
|
@@ -55,7 +53,7 @@ repos:
|
|
55
53
|
hooks:
|
56
54
|
- id: pyproject-fmt
|
57
55
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
58
|
-
rev: v0.12.
|
56
|
+
rev: v0.12.10
|
59
57
|
hooks:
|
60
58
|
- id: ruff
|
61
59
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -68,7 +66,7 @@ repos:
|
|
68
66
|
additional_dependencies:
|
69
67
|
- tomli
|
70
68
|
- repo: https://github.com/jsh9/pydoclint
|
71
|
-
rev: 0.6.
|
69
|
+
rev: 0.6.10
|
72
70
|
hooks:
|
73
71
|
- id: pydoclint
|
74
72
|
args:
|
@@ -0,0 +1 @@
|
|
1
|
+
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -0,0 +1 @@
|
|
1
|
+
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -0,0 +1 @@
|
|
1
|
+
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -0,0 +1 @@
|
|
1
|
+
f2683e2fcbf67fa54bee4baa3ead293a
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.1
|
3
|
+
Version: 3.11.3
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.10.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.
|
35
|
+
Requires-Dist: mcp>=1.13.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -52,9 +52,9 @@ Requires-Dist: gmft>=0.4.2; extra == 'all'
|
|
52
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
53
53
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
54
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
55
|
-
Requires-Dist: paddleocr>=3.
|
56
|
-
Requires-Dist: paddlepaddle>=3.1.
|
57
|
-
Requires-Dist: pandas>=2.3.
|
55
|
+
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
56
|
+
Requires-Dist: paddlepaddle>=3.1.1; extra == 'all'
|
57
|
+
Requires-Dist: pandas>=2.3.2; extra == 'all'
|
58
58
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
59
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
60
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
@@ -73,7 +73,7 @@ Provides-Extra: crypto
|
|
73
73
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
74
74
|
Provides-Extra: document-classification
|
75
75
|
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
-
Requires-Dist: pandas>=2.3.
|
76
|
+
Requires-Dist: pandas>=2.3.2; extra == 'document-classification'
|
77
77
|
Provides-Extra: easyocr
|
78
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
79
79
|
Provides-Extra: entity-extraction
|
@@ -84,8 +84,8 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
|
84
84
|
Provides-Extra: langdetect
|
85
85
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
86
86
|
Provides-Extra: paddleocr
|
87
|
-
Requires-Dist: paddleocr>=3.
|
88
|
-
Requires-Dist: paddlepaddle>=3.1.
|
87
|
+
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
88
|
+
Requires-Dist: paddlepaddle>=3.1.1; extra == 'paddleocr'
|
89
89
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
90
90
|
Description-Content-Type: text/markdown
|
91
91
|
|
@@ -0,0 +1,160 @@
|
|
1
|
+
version: "3"
|
2
|
+
|
3
|
+
env:
|
4
|
+
DOCKER_BUILDKIT: 1
|
5
|
+
BUILDKIT_PROGRESS: plain
|
6
|
+
|
7
|
+
vars:
|
8
|
+
DOCKER_DIR: .docker
|
9
|
+
DOCKERFILE: "{{.DOCKER_DIR}}/Dockerfile"
|
10
|
+
TEST_DIR: tests/e2e
|
11
|
+
LOG_DIR: "{{.TEST_DIR}}/logs"
|
12
|
+
|
13
|
+
tasks:
|
14
|
+
# Main task for Docker E2E testing
|
15
|
+
docker:e2e:
|
16
|
+
desc: "Build Docker images and run E2E tests"
|
17
|
+
deps:
|
18
|
+
- docker:build
|
19
|
+
cmds:
|
20
|
+
- uv run python {{.TEST_DIR}}/docker_images_test.py
|
21
|
+
|
22
|
+
# Docker build tasks
|
23
|
+
docker:build:
|
24
|
+
desc: "Build all Docker images for testing"
|
25
|
+
deps:
|
26
|
+
- docker:build:core
|
27
|
+
- docker:build:easyocr
|
28
|
+
- docker:build:paddle
|
29
|
+
- docker:build:gmft
|
30
|
+
cmds:
|
31
|
+
- docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep kreuzberg || true
|
32
|
+
|
33
|
+
docker:build:core:
|
34
|
+
desc: "Build core Docker image"
|
35
|
+
cmds:
|
36
|
+
- docker build -f {{.DOCKERFILE}} --build-arg EXTRAS="" -t kreuzberg:core .
|
37
|
+
sources:
|
38
|
+
- "{{.DOCKERFILE}}"
|
39
|
+
- kreuzberg/**/*.py
|
40
|
+
- pyproject.toml
|
41
|
+
|
42
|
+
docker:build:easyocr:
|
43
|
+
desc: "Build EasyOCR Docker image"
|
44
|
+
cmds:
|
45
|
+
- docker build -f {{.DOCKERFILE}} --build-arg EXTRAS="easyocr" -t kreuzberg:easyocr .
|
46
|
+
sources:
|
47
|
+
- "{{.DOCKERFILE}}"
|
48
|
+
- kreuzberg/**/*.py
|
49
|
+
- pyproject.toml
|
50
|
+
|
51
|
+
docker:build:paddle:
|
52
|
+
desc: "Build PaddleOCR Docker image"
|
53
|
+
cmds:
|
54
|
+
- docker build -f {{.DOCKERFILE}} --build-arg EXTRAS="paddleocr" -t kreuzberg:paddle .
|
55
|
+
sources:
|
56
|
+
- "{{.DOCKERFILE}}"
|
57
|
+
- kreuzberg/**/*.py
|
58
|
+
- pyproject.toml
|
59
|
+
|
60
|
+
docker:build:gmft:
|
61
|
+
desc: "Build GMFT Docker image"
|
62
|
+
cmds:
|
63
|
+
- docker build -f {{.DOCKERFILE}} --build-arg EXTRAS="gmft" -t kreuzberg:gmft .
|
64
|
+
sources:
|
65
|
+
- "{{.DOCKERFILE}}"
|
66
|
+
- kreuzberg/**/*.py
|
67
|
+
- pyproject.toml
|
68
|
+
|
69
|
+
# Test runner variants
|
70
|
+
docker:test:
|
71
|
+
desc: "Run Docker E2E tests (images must be built)"
|
72
|
+
cmds:
|
73
|
+
- uv run python {{.TEST_DIR}}/docker_images_test.py
|
74
|
+
|
75
|
+
# Utility tasks
|
76
|
+
docker:clean:
|
77
|
+
desc: "Clean up Docker test images and containers"
|
78
|
+
cmds:
|
79
|
+
- docker ps -aq --filter "name=kreuzberg-test" | xargs -r docker rm -f 2>/dev/null || true
|
80
|
+
- docker rmi kreuzberg:core kreuzberg:easyocr kreuzberg:paddle kreuzberg:gmft 2>/dev/null || true
|
81
|
+
- docker system prune -f
|
82
|
+
|
83
|
+
docker:logs:
|
84
|
+
desc: "Show logs from test containers"
|
85
|
+
cmds:
|
86
|
+
- docker ps -a --filter "name=kreuzberg-test" --format "table {{.Names}}\t{{.Status}}"
|
87
|
+
- for container in $(docker ps -a --filter "name=kreuzberg-test" --format "{{.Names}}"); do echo "=== $container ==="; docker logs --tail 50 "$container" 2>&1 || true; done
|
88
|
+
|
89
|
+
# Development tasks
|
90
|
+
install:
|
91
|
+
desc: "Install dependencies with uv"
|
92
|
+
cmds:
|
93
|
+
- uv sync
|
94
|
+
|
95
|
+
install:all:
|
96
|
+
desc: "Install all optional dependencies with uv"
|
97
|
+
cmds:
|
98
|
+
- uv sync --all-extras
|
99
|
+
|
100
|
+
test:
|
101
|
+
desc: "Run tests with pytest"
|
102
|
+
cmds:
|
103
|
+
- uv run pytest
|
104
|
+
|
105
|
+
test:cov:
|
106
|
+
desc: "Run tests with coverage"
|
107
|
+
cmds:
|
108
|
+
- uv run pytest --cov
|
109
|
+
|
110
|
+
format:
|
111
|
+
desc: "Format code with ruff"
|
112
|
+
cmds:
|
113
|
+
- uv run ruff format
|
114
|
+
|
115
|
+
lint:
|
116
|
+
desc: "Lint code with ruff"
|
117
|
+
cmds:
|
118
|
+
- uv run ruff check
|
119
|
+
|
120
|
+
lint:fix:
|
121
|
+
desc: "Fix linting issues with ruff"
|
122
|
+
cmds:
|
123
|
+
- uv run ruff check --fix
|
124
|
+
|
125
|
+
typecheck:
|
126
|
+
desc: "Type check with mypy"
|
127
|
+
cmds:
|
128
|
+
- uv run mypy
|
129
|
+
|
130
|
+
# Documentation tasks
|
131
|
+
docs:build:
|
132
|
+
desc: "Build documentation"
|
133
|
+
cmds:
|
134
|
+
- uv run mkdocs build --clean --strict
|
135
|
+
|
136
|
+
docs:serve:
|
137
|
+
desc: "Serve documentation locally"
|
138
|
+
cmds:
|
139
|
+
- uv run mkdocs serve
|
140
|
+
|
141
|
+
# CI/CD tasks
|
142
|
+
ci:
|
143
|
+
desc: "Run full CI pipeline locally"
|
144
|
+
deps:
|
145
|
+
- format
|
146
|
+
- lint
|
147
|
+
- typecheck
|
148
|
+
- test:cov
|
149
|
+
|
150
|
+
ci:docker:
|
151
|
+
desc: "Run Docker CI pipeline"
|
152
|
+
deps:
|
153
|
+
- docker:build
|
154
|
+
- docker:test
|
155
|
+
|
156
|
+
# Default task
|
157
|
+
default:
|
158
|
+
desc: "Show available tasks"
|
159
|
+
cmds:
|
160
|
+
- task --list
|