kreuzberg 3.2.0__tar.gz → 3.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.4.0/.commitlintrc +1 -0
- kreuzberg-3.4.0/.docker/Dockerfile +21 -0
- kreuzberg-3.4.0/.docker/README.md +87 -0
- kreuzberg-3.4.0/.dockerignore +15 -0
- kreuzberg-3.4.0/.github/benchmarks/README.md +15 -0
- kreuzberg-3.4.0/.github/dependabot.yaml +6 -0
- kreuzberg-3.4.0/.github/workflows/ci.yaml +124 -0
- kreuzberg-3.4.0/.github/workflows/pr-title.yaml +20 -0
- kreuzberg-3.4.0/.github/workflows/publish-docker.yml +101 -0
- kreuzberg-3.4.0/.github/workflows/release.yaml +31 -0
- kreuzberg-3.4.0/.gitignore +34 -0
- kreuzberg-3.4.0/.markdownlint.yaml +17 -0
- kreuzberg-3.4.0/.pre-commit-config.yaml +86 -0
- kreuzberg-3.4.0/PKG-INFO +290 -0
- kreuzberg-3.4.0/README.md +229 -0
- kreuzberg-3.4.0/ai-rulez.yaml +166 -0
- kreuzberg-3.4.0/benchmarks/README.md +152 -0
- kreuzberg-3.4.0/benchmarks/benchmark_baseline.py +117 -0
- kreuzberg-3.4.0/benchmarks/end_to_end_benchmark.py +239 -0
- kreuzberg-3.4.0/benchmarks/final_benchmark.py +147 -0
- kreuzberg-3.4.0/benchmarks/pyproject.toml +28 -0
- kreuzberg-3.4.0/benchmarks/results/baseline_results.json +35 -0
- kreuzberg-3.4.0/benchmarks/results/benchmark_msgpack_20250702_003800.json +50 -0
- kreuzberg-3.4.0/benchmarks/results/comprehensive_caching_results.json +55 -0
- kreuzberg-3.4.0/benchmarks/results/final_benchmark_results.json +12 -0
- kreuzberg-3.4.0/benchmarks/results/mime_caching_results.json +18 -0
- kreuzberg-3.4.0/benchmarks/results/msgspec_caching_results.json +10 -0
- kreuzberg-3.4.0/benchmarks/results/ocr_caching_results.json +17 -0
- kreuzberg-3.4.0/benchmarks/results/serialization_benchmark_results.json +42 -0
- kreuzberg-3.4.0/benchmarks/results/statistical_benchmark_results.json +26 -0
- kreuzberg-3.4.0/benchmarks/results/table_caching_results.json +17 -0
- kreuzberg-3.4.0/benchmarks/serialization_benchmark.py +167 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/__init__.py +3 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/__main__.py +6 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +274 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/cli.py +247 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/models.py +145 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/profiler.py +184 -0
- kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/runner.py +278 -0
- kreuzberg-3.4.0/benchmarks/statistical_benchmark.py +220 -0
- kreuzberg-3.4.0/docs/advanced/custom-extractors.md +203 -0
- kreuzberg-3.4.0/docs/advanced/custom-hooks.md +148 -0
- kreuzberg-3.4.0/docs/advanced/error-handling.md +181 -0
- kreuzberg-3.4.0/docs/advanced/index.md +41 -0
- kreuzberg-3.4.0/docs/advanced/performance.md +269 -0
- kreuzberg-3.4.0/docs/api-reference/exceptions.md +33 -0
- kreuzberg-3.4.0/docs/api-reference/extraction-functions.md +59 -0
- kreuzberg-3.4.0/docs/api-reference/extractor-registry.md +5 -0
- kreuzberg-3.4.0/docs/api-reference/index.md +51 -0
- kreuzberg-3.4.0/docs/api-reference/ocr-configuration.md +27 -0
- kreuzberg-3.4.0/docs/api-reference/types.md +51 -0
- kreuzberg-3.4.0/docs/assets/favicon.png +0 -0
- kreuzberg-3.4.0/docs/assets/logo.png +0 -0
- kreuzberg-3.4.0/docs/changelog.md +30 -0
- kreuzberg-3.4.0/docs/cli.md +190 -0
- kreuzberg-3.4.0/docs/contributing.md +78 -0
- kreuzberg-3.4.0/docs/css/extra.css +56 -0
- kreuzberg-3.4.0/docs/examples/extraction-examples.md +195 -0
- kreuzberg-3.4.0/docs/examples/index.md +48 -0
- kreuzberg-3.4.0/docs/getting-started/index.md +20 -0
- kreuzberg-3.4.0/docs/getting-started/installation.md +117 -0
- kreuzberg-3.4.0/docs/getting-started/quick-start.md +111 -0
- kreuzberg-3.4.0/docs/index.md +15 -0
- kreuzberg-3.4.0/docs/user-guide/api-server.md +169 -0
- kreuzberg-3.4.0/docs/user-guide/basic-usage.md +133 -0
- kreuzberg-3.4.0/docs/user-guide/chunking.md +124 -0
- kreuzberg-3.4.0/docs/user-guide/docker.md +249 -0
- kreuzberg-3.4.0/docs/user-guide/extraction-configuration.md +162 -0
- kreuzberg-3.4.0/docs/user-guide/index.md +42 -0
- kreuzberg-3.4.0/docs/user-guide/metadata-extraction.md +74 -0
- kreuzberg-3.4.0/docs/user-guide/ocr-backends.md +238 -0
- kreuzberg-3.4.0/docs/user-guide/ocr-configuration.md +161 -0
- kreuzberg-3.4.0/docs/user-guide/supported-formats.md +48 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/__init__.py +3 -0
- kreuzberg-3.4.0/kreuzberg/__main__.py +8 -0
- kreuzberg-3.4.0/kreuzberg/_api/main.py +87 -0
- kreuzberg-3.4.0/kreuzberg/_cli_config.py +175 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_image.py +39 -4
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg-3.4.0/kreuzberg/_extractors/_pdf.py +351 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_presentation.py +1 -1
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg-3.4.0/kreuzberg/_gmft.py +380 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_mime_types.py +62 -16
- kreuzberg-3.4.0/kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg-3.4.0/kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg-3.4.0/kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg-3.4.0/kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg-3.4.0/kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_easyocr.py +6 -12
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_paddleocr.py +15 -13
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_tesseract.py +136 -46
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_playa.py +43 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_types.py +4 -0
- kreuzberg-3.4.0/kreuzberg/_utils/_cache.py +372 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_device.py +10 -27
- kreuzberg-3.4.0/kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg-3.4.0/kreuzberg/_utils/_errors.py +232 -0
- kreuzberg-3.4.0/kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg-3.4.0/kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg-3.4.0/kreuzberg/_utils/_serialization.py +82 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_string.py +1 -1
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_sync.py +21 -0
- kreuzberg-3.4.0/kreuzberg/cli.py +338 -0
- kreuzberg-3.4.0/kreuzberg/extraction.py +462 -0
- kreuzberg-3.4.0/kreuzberg/py.typed +0 -0
- kreuzberg-3.4.0/mkdocs.yaml +157 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/pyproject.toml +68 -28
- kreuzberg-3.4.0/tests/__init__.py +0 -0
- kreuzberg-3.4.0/tests/api/__init__.py +0 -0
- kreuzberg-3.4.0/tests/api/main_test.py +252 -0
- kreuzberg-3.4.0/tests/chunker_test.py +102 -0
- kreuzberg-3.4.0/tests/cli_integration_test.py +523 -0
- kreuzberg-3.4.0/tests/cli_test.py +335 -0
- kreuzberg-3.4.0/tests/conftest.py +117 -0
- kreuzberg-3.4.0/tests/exceptions_test.py +101 -0
- kreuzberg-3.4.0/tests/extraction_batch_test.py +278 -0
- kreuzberg-3.4.0/tests/extraction_test.py +373 -0
- kreuzberg-3.4.0/tests/extractors/__init__.py +0 -0
- kreuzberg-3.4.0/tests/extractors/html_test.py +54 -0
- kreuzberg-3.4.0/tests/extractors/image_test.py +240 -0
- kreuzberg-3.4.0/tests/extractors/pandoc_metadata_test.py +323 -0
- kreuzberg-3.4.0/tests/extractors/pandoc_test.py +458 -0
- kreuzberg-3.4.0/tests/extractors/pdf_test.py +385 -0
- kreuzberg-3.4.0/tests/extractors/presentation_test.py +410 -0
- kreuzberg-3.4.0/tests/extractors/spreed_sheet_test.py +325 -0
- kreuzberg-3.4.0/tests/gmft_extended_test.py +163 -0
- kreuzberg-3.4.0/tests/gmft_test.py +387 -0
- kreuzberg-3.4.0/tests/hooks_test.py +205 -0
- kreuzberg-3.4.0/tests/mime_types_test.py +199 -0
- kreuzberg-3.4.0/tests/multiprocessing/__init__.py +1 -0
- kreuzberg-3.4.0/tests/multiprocessing/gmft_integration_test.py +104 -0
- kreuzberg-3.4.0/tests/multiprocessing/process_manager_test.py +282 -0
- kreuzberg-3.4.0/tests/multiprocessing/sync_tesseract_test.py +367 -0
- kreuzberg-3.4.0/tests/multiprocessing/tesseract_pool_test.py +349 -0
- kreuzberg-3.4.0/tests/ocr/__init__.py +0 -0
- kreuzberg-3.4.0/tests/ocr/base_test.py +79 -0
- kreuzberg-3.4.0/tests/ocr/device_integration_test.py +270 -0
- kreuzberg-3.4.0/tests/ocr/easyocr_test.py +462 -0
- kreuzberg-3.4.0/tests/ocr/init_test.py +41 -0
- kreuzberg-3.4.0/tests/ocr/paddleocr_test.py +857 -0
- kreuzberg-3.4.0/tests/ocr/tesseract_test.py +433 -0
- kreuzberg-3.4.0/tests/playa_test.py +111 -0
- kreuzberg-3.4.0/tests/registry_test.py +190 -0
- kreuzberg-3.4.0/tests/test_source_files/document.docx +0 -0
- kreuzberg-3.4.0/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- kreuzberg-3.4.0/tests/test_source_files/excel.xlsx +0 -0
- kreuzberg-3.4.0/tests/test_source_files/html.html +10 -0
- kreuzberg-3.4.0/tests/test_source_files/markdown.md +1 -0
- kreuzberg-3.4.0/tests/test_source_files/non-ascii-text.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/non-searchable.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/ocr-image.jpg +0 -0
- kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- kreuzberg-3.4.0/tests/test_source_files/sample-contract.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/scanned.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/searchable.pdf +0 -0
- kreuzberg-3.4.0/tests/test_source_files/test-article.pdf +0 -0
- kreuzberg-3.4.0/tests/types_test.py +132 -0
- kreuzberg-3.4.0/tests/utils/__init__.py +0 -0
- kreuzberg-3.4.0/tests/utils/cache_test.py +473 -0
- kreuzberg-3.4.0/tests/utils/device_test.py +349 -0
- kreuzberg-3.4.0/tests/utils/errors_test.py +309 -0
- kreuzberg-3.4.0/tests/utils/pdf_lock_test.py +233 -0
- kreuzberg-3.4.0/tests/utils/process_pool_test.py +246 -0
- kreuzberg-3.4.0/tests/utils/serialization_test.py +336 -0
- kreuzberg-3.4.0/tests/utils/string_test.py +85 -0
- kreuzberg-3.4.0/tests/utils/sync_test.py +309 -0
- kreuzberg-3.4.0/tests/utils/tmp_test.py +50 -0
- kreuzberg-3.4.0/uv.lock +3804 -0
- kreuzberg-3.2.0/PKG-INFO +0 -166
- kreuzberg-3.2.0/README.md +0 -112
- kreuzberg-3.2.0/kreuzberg/_extractors/_pdf.py +0 -171
- kreuzberg-3.2.0/kreuzberg/_gmft.py +0 -174
- kreuzberg-3.2.0/kreuzberg/extraction.py +0 -251
- kreuzberg-3.2.0/kreuzberg.egg-info/PKG-INFO +0 -166
- kreuzberg-3.2.0/kreuzberg.egg-info/SOURCES.txt +0 -37
- kreuzberg-3.2.0/kreuzberg.egg-info/dependency_links.txt +0 -1
- kreuzberg-3.2.0/kreuzberg.egg-info/requires.txt +0 -35
- kreuzberg-3.2.0/kreuzberg.egg-info/top_level.txt +0 -1
- kreuzberg-3.2.0/setup.cfg +0 -4
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/LICENSE +0 -0
- {kreuzberg-3.2.0/kreuzberg/_extractors → kreuzberg-3.4.0/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.2.0/kreuzberg/_utils → kreuzberg-3.4.0/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_registry.py +0 -0
- /kreuzberg-3.2.0/kreuzberg/py.typed → /kreuzberg-3.4.0/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/exceptions.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
{ "extends": ["@commitlint/config-conventional"] }
|
@@ -0,0 +1,21 @@
|
|
1
|
+
FROM ghcr.io/astral-sh/uv:python3.13-bookworm as app
|
2
|
+
ARG EXTRAS=""
|
3
|
+
WORKDIR /app
|
4
|
+
ENV PYTHONDONTWRITEBYTECODE 1
|
5
|
+
ENV PYTHONUNBUFFERED 1
|
6
|
+
ENV UV_LINK_MODE=copy
|
7
|
+
|
8
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
9
|
+
pandoc \
|
10
|
+
tesseract-ocr \
|
11
|
+
&& apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
12
|
+
|
13
|
+
|
14
|
+
COPY pyproject.toml uv.lock README.md ./
|
15
|
+
COPY kreuzberg kreuzberg
|
16
|
+
|
17
|
+
RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode
|
18
|
+
|
19
|
+
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
|
20
|
+
USER appuser
|
21
|
+
CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Kreuzberg Docker Images
|
2
|
+
|
3
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
6
|
+
[License: MIT](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
|
7
|
+
|
8
|
+
High-performance Python library for text extraction from documents, available as Docker images.
|
9
|
+
|
10
|
+
**Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
|
11
|
+
|
12
|
+
## Quick Start
|
13
|
+
|
14
|
+
```bash
|
15
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
16
|
+
```
|
17
|
+
|
18
|
+
## Available Tags
|
19
|
+
|
20
|
+
- `latest` - Latest stable release with API server and Tesseract OCR
|
21
|
+
- `X.Y.Z` - Specific version (e.g., `3.0.0`)
|
22
|
+
- `X.Y.Z-easyocr` - With EasyOCR support
|
23
|
+
- `X.Y.Z-paddle` - With PaddleOCR support
|
24
|
+
- `X.Y.Z-gmft` - With GMFT table extraction
|
25
|
+
- `X.Y.Z-all` - With all optional dependencies
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
### Extract Files via API
|
30
|
+
|
31
|
+
```bash
|
32
|
+
# Single file
|
33
|
+
curl -X POST http://localhost:8000/extract \
|
34
|
+
-F "data=@document.pdf"
|
35
|
+
|
36
|
+
# Multiple files
|
37
|
+
curl -X POST http://localhost:8000/extract \
|
38
|
+
-F "data=@document1.pdf" \
|
39
|
+
-F "data=@document2.docx"
|
40
|
+
```
|
41
|
+
|
42
|
+
### Docker Compose
|
43
|
+
|
44
|
+
```yaml
|
45
|
+
version: '3.8'
|
46
|
+
|
47
|
+
services:
|
48
|
+
kreuzberg:
|
49
|
+
image: goldziher/kreuzberg:latest
|
50
|
+
ports:
|
51
|
+
- "8000:8000"
|
52
|
+
restart: unless-stopped
|
53
|
+
```
|
54
|
+
|
55
|
+
## Features
|
56
|
+
|
57
|
+
- **🚀 High Performance**: Optimized for speed and efficiency
|
58
|
+
- **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
|
59
|
+
- **🔍 OCR Support**: Built-in Tesseract, optional EasyOCR/PaddleOCR
|
60
|
+
- **📊 Table Extraction**: Extract tables with GMFT
|
61
|
+
- **🔒 Secure**: Runs as non-root user, no external API calls
|
62
|
+
- **📦 Ready to Use**: Pre-configured API server
|
63
|
+
|
64
|
+
## Documentation
|
65
|
+
|
66
|
+
- **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
|
67
|
+
- **[Full Documentation](https://goldziher.github.io/kreuzberg/)** - Complete user guide and API reference
|
68
|
+
- **[API Documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/)** - REST API endpoints and usage
|
69
|
+
- **[Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/)** - Detailed Docker usage guide
|
70
|
+
|
71
|
+
## Support
|
72
|
+
|
73
|
+
- **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
|
74
|
+
- **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
|
75
|
+
- **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
|
80
|
+
|
81
|
+
## License
|
82
|
+
|
83
|
+
MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
|
84
|
+
|
85
|
+
______________________________________________________________________
|
86
|
+
|
87
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Performance Baseline
|
2
|
+
|
3
|
+
This directory contains baseline performance metrics for the Kreuzberg library.
|
4
|
+
|
5
|
+
## Files
|
6
|
+
|
7
|
+
- `baseline.json` - Performance baseline automatically updated from main branch CI
|
8
|
+
- This file is used for performance regression detection in PRs
|
9
|
+
|
10
|
+
## How it works
|
11
|
+
|
12
|
+
1. When code is pushed to `main`, CI runs benchmarks and stores results as `baseline.json`
|
13
|
+
1. When PRs are opened, CI compares current performance against this baseline
|
14
|
+
1. If performance degrades beyond the threshold (20%), the CI check fails
|
15
|
+
1. The baseline is automatically updated when new changes are merged to main
|
@@ -0,0 +1,124 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
pull_request:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
push:
|
8
|
+
branches:
|
9
|
+
- main
|
10
|
+
- feat/smart-multiprocessing
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
validate:
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
timeout-minutes: 10
|
16
|
+
steps:
|
17
|
+
- name: Checkout
|
18
|
+
uses: actions/checkout@v4
|
19
|
+
|
20
|
+
- name: Install uv
|
21
|
+
uses: astral-sh/setup-uv@v6
|
22
|
+
with:
|
23
|
+
enable-cache: true
|
24
|
+
|
25
|
+
- name: Set up Python
|
26
|
+
uses: actions/setup-python@v5
|
27
|
+
with:
|
28
|
+
python-version-file: "pyproject.toml"
|
29
|
+
|
30
|
+
- name: Install Dependencies
|
31
|
+
shell: bash
|
32
|
+
run: |
|
33
|
+
if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
|
34
|
+
echo "Removing existing .venv directory on Windows"
|
35
|
+
rm -rf .venv
|
36
|
+
fi
|
37
|
+
uv sync --all-packages --all-extras --dev
|
38
|
+
|
39
|
+
- name: Load Cached Pre-Commit Dependencies
|
40
|
+
id: cached-pre-commit-dependencies
|
41
|
+
uses: actions/cache@v4
|
42
|
+
with:
|
43
|
+
path: ~/.cache/pre-commit/
|
44
|
+
key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
45
|
+
|
46
|
+
- name: Execute Pre-Commit
|
47
|
+
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
|
48
|
+
|
49
|
+
test:
|
50
|
+
strategy:
|
51
|
+
matrix:
|
52
|
+
os: [ ubuntu-latest, macOS-latest, windows-latest ]
|
53
|
+
python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.9", "3.10", "3.11", "3.12", "3.13"]') }}
|
54
|
+
runs-on: ${{ matrix.os }}
|
55
|
+
timeout-minutes: 30
|
56
|
+
steps:
|
57
|
+
- name: Checkout
|
58
|
+
uses: actions/checkout@v4
|
59
|
+
|
60
|
+
- name: Install uv
|
61
|
+
uses: astral-sh/setup-uv@v6
|
62
|
+
with:
|
63
|
+
enable-cache: true
|
64
|
+
|
65
|
+
- name: Install Python
|
66
|
+
uses: actions/setup-python@v5
|
67
|
+
id: setup-python
|
68
|
+
with:
|
69
|
+
python-version: ${{ matrix.python }}
|
70
|
+
|
71
|
+
- name: Cache Python Dependencies
|
72
|
+
id: python-cache
|
73
|
+
uses: actions/cache@v4
|
74
|
+
with:
|
75
|
+
path: |
|
76
|
+
~/.cache/uv
|
77
|
+
.venv
|
78
|
+
key: python-dependencies-${{ matrix.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('uv.lock') }}
|
79
|
+
restore-keys: |
|
80
|
+
python-dependencies-${{ matrix.os }}-${{ matrix.python }}-
|
81
|
+
|
82
|
+
- name: Install Dependencies
|
83
|
+
shell: bash
|
84
|
+
run: |
|
85
|
+
if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
|
86
|
+
echo "Removing existing .venv directory on Windows"
|
87
|
+
rm -rf .venv
|
88
|
+
fi
|
89
|
+
uv sync --all-packages --all-extras --dev
|
90
|
+
|
91
|
+
- name: Cache Test Artifacts
|
92
|
+
uses: actions/cache@v4
|
93
|
+
with:
|
94
|
+
path: .pytest_cache/
|
95
|
+
key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
|
96
|
+
|
97
|
+
- name: Cache and Install Homebrew (macOS)
|
98
|
+
if: runner.os == 'macOS'
|
99
|
+
uses: tecolicom/actions-use-homebrew-tools@v1
|
100
|
+
with:
|
101
|
+
tools: 'tesseract tesseract-lang pandoc'
|
102
|
+
key: 'homebrew-tools-${{ runner.os }}'
|
103
|
+
cache: yes
|
104
|
+
verbose: false
|
105
|
+
|
106
|
+
- name: Cache and Install APT Packages (Linux)
|
107
|
+
if: runner.os == 'Linux'
|
108
|
+
uses: awalsh128/cache-apt-pkgs-action@latest
|
109
|
+
with:
|
110
|
+
packages: tesseract-ocr tesseract-ocr-deu pandoc
|
111
|
+
version: 1.0
|
112
|
+
|
113
|
+
- name: Install System Dependencies (Windows)
|
114
|
+
if: runner.os == 'Windows'
|
115
|
+
run: |
|
116
|
+
choco install -y tesseract pandoc
|
117
|
+
Write-Output "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
118
|
+
Write-Output "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
119
|
+
$env:PATH = "C:\Program Files\Tesseract-OCR;C:\Program Files\Pandoc;" + $env:PATH
|
120
|
+
tesseract --version
|
121
|
+
pandoc --version
|
122
|
+
|
123
|
+
- name: Run Tests
|
124
|
+
run: uv run pytest -s -vvv
|
@@ -0,0 +1,20 @@
|
|
1
|
+
name: "Check PR Title"
|
2
|
+
|
3
|
+
on:
|
4
|
+
pull_request_target:
|
5
|
+
types:
|
6
|
+
- opened
|
7
|
+
- edited
|
8
|
+
- synchronize
|
9
|
+
|
10
|
+
permissions:
|
11
|
+
pull-requests: read
|
12
|
+
|
13
|
+
jobs:
|
14
|
+
main:
|
15
|
+
name: Validate PR title
|
16
|
+
runs-on: ubuntu-latest
|
17
|
+
steps:
|
18
|
+
- uses: amannn/action-semantic-pull-request@v5
|
19
|
+
env:
|
20
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# .github/workflows/publish-docker.yml
|
2
|
+
|
3
|
+
name: Publish Docker Images
|
4
|
+
|
5
|
+
on:
|
6
|
+
workflow_run:
|
7
|
+
workflows: ["Release"]
|
8
|
+
types:
|
9
|
+
- completed
|
10
|
+
branches:
|
11
|
+
- main
|
12
|
+
|
13
|
+
jobs:
|
14
|
+
build-and-push:
|
15
|
+
runs-on: ubuntu-latest
|
16
|
+
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
17
|
+
permissions:
|
18
|
+
contents: read
|
19
|
+
packages: write
|
20
|
+
|
21
|
+
strategy:
|
22
|
+
matrix:
|
23
|
+
include:
|
24
|
+
- name: core
|
25
|
+
extras: ""
|
26
|
+
tag_suffix: "" # The base image tag (includes API + tesseract)
|
27
|
+
- name: easyocr
|
28
|
+
extras: "easyocr"
|
29
|
+
tag_suffix: "-easyocr"
|
30
|
+
- name: paddle
|
31
|
+
extras: "paddleocr"
|
32
|
+
tag_suffix: "-paddle"
|
33
|
+
- name: gmft
|
34
|
+
extras: "gmft"
|
35
|
+
tag_suffix: "-gmft"
|
36
|
+
- name: all
|
37
|
+
extras: "all"
|
38
|
+
tag_suffix: "-all"
|
39
|
+
|
40
|
+
steps:
|
41
|
+
- name: Checkout repository
|
42
|
+
uses: actions/checkout@v4
|
43
|
+
with:
|
44
|
+
ref: ${{ github.event.workflow_run.head_branch }}
|
45
|
+
|
46
|
+
- name: Get release version
|
47
|
+
id: get_version
|
48
|
+
run: |
|
49
|
+
echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
|
50
|
+
# If triggered by a tag, extract version
|
51
|
+
if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
52
|
+
echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
|
53
|
+
else
|
54
|
+
# Get the latest tag
|
55
|
+
git fetch --tags
|
56
|
+
echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
|
57
|
+
fi
|
58
|
+
|
59
|
+
- name: Set up QEMU
|
60
|
+
uses: docker/setup-qemu-action@v3
|
61
|
+
|
62
|
+
- name: Set up Docker Buildx
|
63
|
+
uses: docker/setup-buildx-action@v3
|
64
|
+
|
65
|
+
- name: Log in to Docker Hub
|
66
|
+
uses: docker/login-action@v3
|
67
|
+
with:
|
68
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
69
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
70
|
+
|
71
|
+
- name: Extract metadata (tags, labels) for Docker
|
72
|
+
id: meta
|
73
|
+
uses: docker/metadata-action@v5
|
74
|
+
with:
|
75
|
+
images: goldziher/kreuzberg
|
76
|
+
tags: |
|
77
|
+
# Release version tag (e.g., v3.0.0-easyocr)
|
78
|
+
type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
|
79
|
+
# Latest tag for each variant (e.g., latest-easyocr)
|
80
|
+
type=raw,value=latest${{ matrix.tag_suffix }}
|
81
|
+
|
82
|
+
- name: Build and push Docker image
|
83
|
+
uses: docker/build-push-action@v5
|
84
|
+
with:
|
85
|
+
context: .
|
86
|
+
file: ./.docker/Dockerfile
|
87
|
+
platforms: linux/amd64,linux/arm64
|
88
|
+
push: true
|
89
|
+
build-args: |
|
90
|
+
EXTRAS=${{ matrix.extras }}
|
91
|
+
tags: ${{ steps.meta.outputs.tags }}
|
92
|
+
labels: ${{ steps.meta.outputs.labels }}
|
93
|
+
|
94
|
+
- name: Update Docker Hub README
|
95
|
+
uses: peter-evans/dockerhub-description@v4
|
96
|
+
if: matrix.name == 'core'
|
97
|
+
with:
|
98
|
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
99
|
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
100
|
+
repository: goldziher/kreuzberg
|
101
|
+
readme-filepath: ./.docker/README.md
|
@@ -0,0 +1,31 @@
|
|
1
|
+
name: Release
|
2
|
+
|
3
|
+
on:
|
4
|
+
release:
|
5
|
+
types: [published]
|
6
|
+
|
7
|
+
jobs:
|
8
|
+
release:
|
9
|
+
runs-on: ubuntu-latest
|
10
|
+
environment: pypi
|
11
|
+
permissions:
|
12
|
+
id-token: write
|
13
|
+
steps:
|
14
|
+
- name: Checkout
|
15
|
+
uses: actions/checkout@v4
|
16
|
+
|
17
|
+
- name: Install uv
|
18
|
+
uses: astral-sh/setup-uv@v6
|
19
|
+
with:
|
20
|
+
enable-cache: true
|
21
|
+
|
22
|
+
- name: Set up Python
|
23
|
+
uses: actions/setup-python@v5
|
24
|
+
with:
|
25
|
+
python-version-file: "pyproject.toml"
|
26
|
+
|
27
|
+
- name: Install Dependencies
|
28
|
+
run: uv build
|
29
|
+
|
30
|
+
- name: Publish
|
31
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
@@ -0,0 +1,34 @@
|
|
1
|
+
*$py.class
|
2
|
+
*.Cache
|
3
|
+
*.cscfg
|
4
|
+
*.egg-info/
|
5
|
+
*.log
|
6
|
+
*.py[cod]
|
7
|
+
*.suo
|
8
|
+
*.user
|
9
|
+
*temp/
|
10
|
+
.coverage
|
11
|
+
.coverage*
|
12
|
+
.cursorrules
|
13
|
+
.dist/
|
14
|
+
.DS_store
|
15
|
+
.env
|
16
|
+
.idea/
|
17
|
+
.kreuzberg/
|
18
|
+
.mypy_cache/
|
19
|
+
.pytest_cache/
|
20
|
+
.python-version
|
21
|
+
.ropeproject
|
22
|
+
.ruff_cache/
|
23
|
+
.run/
|
24
|
+
.venv/
|
25
|
+
.vscode/
|
26
|
+
.windsurfrules
|
27
|
+
__pycache__/
|
28
|
+
benchmark_results.json
|
29
|
+
CLAUDE.md
|
30
|
+
coverage.xml
|
31
|
+
docker-compose.yaml
|
32
|
+
GEMINI.md
|
33
|
+
prompt_template.egg-info/
|
34
|
+
requirements.txt
|
@@ -0,0 +1,86 @@
|
|
1
|
+
repos:
|
2
|
+
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
|
3
|
+
rev: "v9.22.0"
|
4
|
+
hooks:
|
5
|
+
- id: commitlint
|
6
|
+
stages: [commit-msg]
|
7
|
+
additional_dependencies: ["@commitlint/config-conventional"]
|
8
|
+
- repo: https://github.com/Goldziher/ai-rulez
|
9
|
+
rev: v1.1.2
|
10
|
+
hooks:
|
11
|
+
- id: ai-rulez-validate
|
12
|
+
- id: ai-rulez-generate
|
13
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
14
|
+
rev: v5.0.0
|
15
|
+
hooks:
|
16
|
+
- id: name-tests-test
|
17
|
+
args:
|
18
|
+
- --pytest
|
19
|
+
exclude: factories|test_utils|completion.py|test_data
|
20
|
+
- id: trailing-whitespace
|
21
|
+
- id: end-of-file-fixer
|
22
|
+
- id: check-toml
|
23
|
+
- id: check-case-conflict
|
24
|
+
- id: detect-private-key
|
25
|
+
- repo: https://github.com/abravalheri/validate-pyproject
|
26
|
+
rev: v0.24.1
|
27
|
+
hooks:
|
28
|
+
- id: validate-pyproject
|
29
|
+
- repo: https://github.com/executablebooks/mdformat
|
30
|
+
rev: 0.7.22
|
31
|
+
hooks:
|
32
|
+
- id: mdformat
|
33
|
+
additional_dependencies:
|
34
|
+
- mdformat-mkdocs==4.0.0
|
35
|
+
- repo: https://github.com/igorshubovych/markdownlint-cli
|
36
|
+
rev: v0.45.0
|
37
|
+
hooks:
|
38
|
+
- id: markdownlint-fix
|
39
|
+
- repo: https://github.com/adamchainz/blacken-docs
|
40
|
+
rev: 1.19.1
|
41
|
+
hooks:
|
42
|
+
- id: blacken-docs
|
43
|
+
args: ["--pyi", "--line-length", "130"]
|
44
|
+
additional_dependencies:
|
45
|
+
- black==25.1.0
|
46
|
+
- repo: https://github.com/rbubley/mirrors-prettier
|
47
|
+
rev: "v3.6.2"
|
48
|
+
hooks:
|
49
|
+
- id: prettier
|
50
|
+
exclude: ^tests|^.idea|^migrations|^.git|README.md|^docs
|
51
|
+
- repo: https://github.com/tox-dev/pyproject-fmt
|
52
|
+
rev: "v2.6.0"
|
53
|
+
hooks:
|
54
|
+
- id: pyproject-fmt
|
55
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
56
|
+
rev: v0.12.1
|
57
|
+
hooks:
|
58
|
+
- id: ruff
|
59
|
+
args: ["--fix", "--unsafe-fixes"]
|
60
|
+
- id: ruff-format
|
61
|
+
- repo: https://github.com/codespell-project/codespell
|
62
|
+
rev: v2.4.1
|
63
|
+
hooks:
|
64
|
+
- id: codespell
|
65
|
+
exclude: ^tests|^scripts|^kreuzberg/_tesseract|^kreuzberg/_mime_types
|
66
|
+
additional_dependencies:
|
67
|
+
- tomli
|
68
|
+
- repo: https://github.com/jsh9/pydoclint
|
69
|
+
rev: 0.6.7
|
70
|
+
hooks:
|
71
|
+
- id: pydoclint
|
72
|
+
args:
|
73
|
+
[
|
74
|
+
--style=google,
|
75
|
+
--check-return-types=False,
|
76
|
+
--arg-type-hints-in-docstring=False,
|
77
|
+
]
|
78
|
+
exclude: ^benchmarks/|^kreuzberg/_|^tests/|^scripts/|^run_benchmarks\.py
|
79
|
+
- repo: local
|
80
|
+
hooks:
|
81
|
+
- id: mypy
|
82
|
+
name: mypy
|
83
|
+
entry: uv run mypy
|
84
|
+
require_serial: true
|
85
|
+
language: system
|
86
|
+
types: [python]
|