kreuzberg 3.2.0__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. kreuzberg-3.4.0/.commitlintrc +1 -0
  2. kreuzberg-3.4.0/.docker/Dockerfile +21 -0
  3. kreuzberg-3.4.0/.docker/README.md +87 -0
  4. kreuzberg-3.4.0/.dockerignore +15 -0
  5. kreuzberg-3.4.0/.github/benchmarks/README.md +15 -0
  6. kreuzberg-3.4.0/.github/dependabot.yaml +6 -0
  7. kreuzberg-3.4.0/.github/workflows/ci.yaml +124 -0
  8. kreuzberg-3.4.0/.github/workflows/pr-title.yaml +20 -0
  9. kreuzberg-3.4.0/.github/workflows/publish-docker.yml +101 -0
  10. kreuzberg-3.4.0/.github/workflows/release.yaml +31 -0
  11. kreuzberg-3.4.0/.gitignore +34 -0
  12. kreuzberg-3.4.0/.markdownlint.yaml +17 -0
  13. kreuzberg-3.4.0/.pre-commit-config.yaml +86 -0
  14. kreuzberg-3.4.0/PKG-INFO +290 -0
  15. kreuzberg-3.4.0/README.md +229 -0
  16. kreuzberg-3.4.0/ai-rulez.yaml +166 -0
  17. kreuzberg-3.4.0/benchmarks/README.md +152 -0
  18. kreuzberg-3.4.0/benchmarks/benchmark_baseline.py +117 -0
  19. kreuzberg-3.4.0/benchmarks/end_to_end_benchmark.py +239 -0
  20. kreuzberg-3.4.0/benchmarks/final_benchmark.py +147 -0
  21. kreuzberg-3.4.0/benchmarks/pyproject.toml +28 -0
  22. kreuzberg-3.4.0/benchmarks/results/baseline_results.json +35 -0
  23. kreuzberg-3.4.0/benchmarks/results/benchmark_msgpack_20250702_003800.json +50 -0
  24. kreuzberg-3.4.0/benchmarks/results/comprehensive_caching_results.json +55 -0
  25. kreuzberg-3.4.0/benchmarks/results/final_benchmark_results.json +12 -0
  26. kreuzberg-3.4.0/benchmarks/results/mime_caching_results.json +18 -0
  27. kreuzberg-3.4.0/benchmarks/results/msgspec_caching_results.json +10 -0
  28. kreuzberg-3.4.0/benchmarks/results/ocr_caching_results.json +17 -0
  29. kreuzberg-3.4.0/benchmarks/results/serialization_benchmark_results.json +42 -0
  30. kreuzberg-3.4.0/benchmarks/results/statistical_benchmark_results.json +26 -0
  31. kreuzberg-3.4.0/benchmarks/results/table_caching_results.json +17 -0
  32. kreuzberg-3.4.0/benchmarks/serialization_benchmark.py +167 -0
  33. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/__init__.py +3 -0
  34. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/__main__.py +6 -0
  35. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +274 -0
  36. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/cli.py +247 -0
  37. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/models.py +145 -0
  38. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/profiler.py +184 -0
  39. kreuzberg-3.4.0/benchmarks/src/kreuzberg_benchmarks/runner.py +278 -0
  40. kreuzberg-3.4.0/benchmarks/statistical_benchmark.py +220 -0
  41. kreuzberg-3.4.0/docs/advanced/custom-extractors.md +203 -0
  42. kreuzberg-3.4.0/docs/advanced/custom-hooks.md +148 -0
  43. kreuzberg-3.4.0/docs/advanced/error-handling.md +181 -0
  44. kreuzberg-3.4.0/docs/advanced/index.md +41 -0
  45. kreuzberg-3.4.0/docs/advanced/performance.md +269 -0
  46. kreuzberg-3.4.0/docs/api-reference/exceptions.md +33 -0
  47. kreuzberg-3.4.0/docs/api-reference/extraction-functions.md +59 -0
  48. kreuzberg-3.4.0/docs/api-reference/extractor-registry.md +5 -0
  49. kreuzberg-3.4.0/docs/api-reference/index.md +51 -0
  50. kreuzberg-3.4.0/docs/api-reference/ocr-configuration.md +27 -0
  51. kreuzberg-3.4.0/docs/api-reference/types.md +51 -0
  52. kreuzberg-3.4.0/docs/assets/favicon.png +0 -0
  53. kreuzberg-3.4.0/docs/assets/logo.png +0 -0
  54. kreuzberg-3.4.0/docs/changelog.md +30 -0
  55. kreuzberg-3.4.0/docs/cli.md +190 -0
  56. kreuzberg-3.4.0/docs/contributing.md +78 -0
  57. kreuzberg-3.4.0/docs/css/extra.css +56 -0
  58. kreuzberg-3.4.0/docs/examples/extraction-examples.md +195 -0
  59. kreuzberg-3.4.0/docs/examples/index.md +48 -0
  60. kreuzberg-3.4.0/docs/getting-started/index.md +20 -0
  61. kreuzberg-3.4.0/docs/getting-started/installation.md +117 -0
  62. kreuzberg-3.4.0/docs/getting-started/quick-start.md +111 -0
  63. kreuzberg-3.4.0/docs/index.md +15 -0
  64. kreuzberg-3.4.0/docs/user-guide/api-server.md +169 -0
  65. kreuzberg-3.4.0/docs/user-guide/basic-usage.md +133 -0
  66. kreuzberg-3.4.0/docs/user-guide/chunking.md +124 -0
  67. kreuzberg-3.4.0/docs/user-guide/docker.md +249 -0
  68. kreuzberg-3.4.0/docs/user-guide/extraction-configuration.md +162 -0
  69. kreuzberg-3.4.0/docs/user-guide/index.md +42 -0
  70. kreuzberg-3.4.0/docs/user-guide/metadata-extraction.md +74 -0
  71. kreuzberg-3.4.0/docs/user-guide/ocr-backends.md +238 -0
  72. kreuzberg-3.4.0/docs/user-guide/ocr-configuration.md +161 -0
  73. kreuzberg-3.4.0/docs/user-guide/supported-formats.md +48 -0
  74. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/__init__.py +3 -0
  75. kreuzberg-3.4.0/kreuzberg/__main__.py +8 -0
  76. kreuzberg-3.4.0/kreuzberg/_api/main.py +87 -0
  77. kreuzberg-3.4.0/kreuzberg/_cli_config.py +175 -0
  78. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_image.py +39 -4
  79. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_pandoc.py +158 -18
  80. kreuzberg-3.4.0/kreuzberg/_extractors/_pdf.py +351 -0
  81. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_presentation.py +1 -1
  82. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_spread_sheet.py +65 -7
  83. kreuzberg-3.4.0/kreuzberg/_gmft.py +380 -0
  84. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_mime_types.py +62 -16
  85. kreuzberg-3.4.0/kreuzberg/_multiprocessing/__init__.py +6 -0
  86. kreuzberg-3.4.0/kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  87. kreuzberg-3.4.0/kreuzberg/_multiprocessing/process_manager.py +188 -0
  88. kreuzberg-3.4.0/kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  89. kreuzberg-3.4.0/kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  90. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_easyocr.py +6 -12
  91. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_paddleocr.py +15 -13
  92. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_tesseract.py +136 -46
  93. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_playa.py +43 -0
  94. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_types.py +4 -0
  95. kreuzberg-3.4.0/kreuzberg/_utils/_cache.py +372 -0
  96. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_device.py +10 -27
  97. kreuzberg-3.4.0/kreuzberg/_utils/_document_cache.py +220 -0
  98. kreuzberg-3.4.0/kreuzberg/_utils/_errors.py +232 -0
  99. kreuzberg-3.4.0/kreuzberg/_utils/_pdf_lock.py +72 -0
  100. kreuzberg-3.4.0/kreuzberg/_utils/_process_pool.py +100 -0
  101. kreuzberg-3.4.0/kreuzberg/_utils/_serialization.py +82 -0
  102. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_string.py +1 -1
  103. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_sync.py +21 -0
  104. kreuzberg-3.4.0/kreuzberg/cli.py +338 -0
  105. kreuzberg-3.4.0/kreuzberg/extraction.py +462 -0
  106. kreuzberg-3.4.0/kreuzberg/py.typed +0 -0
  107. kreuzberg-3.4.0/mkdocs.yaml +157 -0
  108. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/pyproject.toml +68 -28
  109. kreuzberg-3.4.0/tests/__init__.py +0 -0
  110. kreuzberg-3.4.0/tests/api/__init__.py +0 -0
  111. kreuzberg-3.4.0/tests/api/main_test.py +252 -0
  112. kreuzberg-3.4.0/tests/chunker_test.py +102 -0
  113. kreuzberg-3.4.0/tests/cli_integration_test.py +523 -0
  114. kreuzberg-3.4.0/tests/cli_test.py +335 -0
  115. kreuzberg-3.4.0/tests/conftest.py +117 -0
  116. kreuzberg-3.4.0/tests/exceptions_test.py +101 -0
  117. kreuzberg-3.4.0/tests/extraction_batch_test.py +278 -0
  118. kreuzberg-3.4.0/tests/extraction_test.py +373 -0
  119. kreuzberg-3.4.0/tests/extractors/__init__.py +0 -0
  120. kreuzberg-3.4.0/tests/extractors/html_test.py +54 -0
  121. kreuzberg-3.4.0/tests/extractors/image_test.py +240 -0
  122. kreuzberg-3.4.0/tests/extractors/pandoc_metadata_test.py +323 -0
  123. kreuzberg-3.4.0/tests/extractors/pandoc_test.py +458 -0
  124. kreuzberg-3.4.0/tests/extractors/pdf_test.py +385 -0
  125. kreuzberg-3.4.0/tests/extractors/presentation_test.py +410 -0
  126. kreuzberg-3.4.0/tests/extractors/spreed_sheet_test.py +325 -0
  127. kreuzberg-3.4.0/tests/gmft_extended_test.py +163 -0
  128. kreuzberg-3.4.0/tests/gmft_test.py +387 -0
  129. kreuzberg-3.4.0/tests/hooks_test.py +205 -0
  130. kreuzberg-3.4.0/tests/mime_types_test.py +199 -0
  131. kreuzberg-3.4.0/tests/multiprocessing/__init__.py +1 -0
  132. kreuzberg-3.4.0/tests/multiprocessing/gmft_integration_test.py +104 -0
  133. kreuzberg-3.4.0/tests/multiprocessing/process_manager_test.py +282 -0
  134. kreuzberg-3.4.0/tests/multiprocessing/sync_tesseract_test.py +367 -0
  135. kreuzberg-3.4.0/tests/multiprocessing/tesseract_pool_test.py +349 -0
  136. kreuzberg-3.4.0/tests/ocr/__init__.py +0 -0
  137. kreuzberg-3.4.0/tests/ocr/base_test.py +79 -0
  138. kreuzberg-3.4.0/tests/ocr/device_integration_test.py +270 -0
  139. kreuzberg-3.4.0/tests/ocr/easyocr_test.py +462 -0
  140. kreuzberg-3.4.0/tests/ocr/init_test.py +41 -0
  141. kreuzberg-3.4.0/tests/ocr/paddleocr_test.py +857 -0
  142. kreuzberg-3.4.0/tests/ocr/tesseract_test.py +433 -0
  143. kreuzberg-3.4.0/tests/playa_test.py +111 -0
  144. kreuzberg-3.4.0/tests/registry_test.py +190 -0
  145. kreuzberg-3.4.0/tests/test_source_files/document.docx +0 -0
  146. kreuzberg-3.4.0/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  147. kreuzberg-3.4.0/tests/test_source_files/excel.xlsx +0 -0
  148. kreuzberg-3.4.0/tests/test_source_files/html.html +10 -0
  149. kreuzberg-3.4.0/tests/test_source_files/markdown.md +1 -0
  150. kreuzberg-3.4.0/tests/test_source_files/non-ascii-text.pdf +0 -0
  151. kreuzberg-3.4.0/tests/test_source_files/non-searchable.pdf +0 -0
  152. kreuzberg-3.4.0/tests/test_source_files/ocr-image.jpg +0 -0
  153. kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  154. kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  155. kreuzberg-3.4.0/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  156. kreuzberg-3.4.0/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  157. kreuzberg-3.4.0/tests/test_source_files/sample-contract.pdf +0 -0
  158. kreuzberg-3.4.0/tests/test_source_files/scanned.pdf +0 -0
  159. kreuzberg-3.4.0/tests/test_source_files/searchable.pdf +0 -0
  160. kreuzberg-3.4.0/tests/test_source_files/test-article.pdf +0 -0
  161. kreuzberg-3.4.0/tests/types_test.py +132 -0
  162. kreuzberg-3.4.0/tests/utils/__init__.py +0 -0
  163. kreuzberg-3.4.0/tests/utils/cache_test.py +473 -0
  164. kreuzberg-3.4.0/tests/utils/device_test.py +349 -0
  165. kreuzberg-3.4.0/tests/utils/errors_test.py +309 -0
  166. kreuzberg-3.4.0/tests/utils/pdf_lock_test.py +233 -0
  167. kreuzberg-3.4.0/tests/utils/process_pool_test.py +246 -0
  168. kreuzberg-3.4.0/tests/utils/serialization_test.py +336 -0
  169. kreuzberg-3.4.0/tests/utils/string_test.py +85 -0
  170. kreuzberg-3.4.0/tests/utils/sync_test.py +309 -0
  171. kreuzberg-3.4.0/tests/utils/tmp_test.py +50 -0
  172. kreuzberg-3.4.0/uv.lock +3804 -0
  173. kreuzberg-3.2.0/PKG-INFO +0 -166
  174. kreuzberg-3.2.0/README.md +0 -112
  175. kreuzberg-3.2.0/kreuzberg/_extractors/_pdf.py +0 -171
  176. kreuzberg-3.2.0/kreuzberg/_gmft.py +0 -174
  177. kreuzberg-3.2.0/kreuzberg/extraction.py +0 -251
  178. kreuzberg-3.2.0/kreuzberg.egg-info/PKG-INFO +0 -166
  179. kreuzberg-3.2.0/kreuzberg.egg-info/SOURCES.txt +0 -37
  180. kreuzberg-3.2.0/kreuzberg.egg-info/dependency_links.txt +0 -1
  181. kreuzberg-3.2.0/kreuzberg.egg-info/requires.txt +0 -35
  182. kreuzberg-3.2.0/kreuzberg.egg-info/top_level.txt +0 -1
  183. kreuzberg-3.2.0/setup.cfg +0 -4
  184. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/LICENSE +0 -0
  185. {kreuzberg-3.2.0/kreuzberg/_extractors → kreuzberg-3.4.0/kreuzberg/_api}/__init__.py +0 -0
  186. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_chunker.py +0 -0
  187. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_constants.py +0 -0
  188. {kreuzberg-3.2.0/kreuzberg/_utils → kreuzberg-3.4.0/kreuzberg/_extractors}/__init__.py +0 -0
  189. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_base.py +0 -0
  190. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_extractors/_html.py +0 -0
  191. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/__init__.py +0 -0
  192. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_ocr/_base.py +0 -0
  193. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_registry.py +0 -0
  194. /kreuzberg-3.2.0/kreuzberg/py.typed → /kreuzberg-3.4.0/kreuzberg/_utils/__init__.py +0 -0
  195. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/_utils/_tmp.py +0 -0
  196. {kreuzberg-3.2.0 → kreuzberg-3.4.0}/kreuzberg/exceptions.py +0 -0
@@ -0,0 +1 @@
1
+ { "extends": ["@commitlint/config-conventional"] }
@@ -0,0 +1,21 @@
1
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm as app
2
+ ARG EXTRAS=""
3
+ WORKDIR /app
4
+ ENV PYTHONDONTWRITEBYTECODE 1
5
+ ENV PYTHONUNBUFFERED 1
6
+ ENV UV_LINK_MODE=copy
7
+
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ pandoc \
10
+ tesseract-ocr \
11
+ && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
12
+
13
+
14
+ COPY pyproject.toml uv.lock README.md ./
15
+ COPY kreuzberg kreuzberg
16
+
17
+ RUN uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode
18
+
19
+ RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
20
+ USER appuser
21
+ CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
@@ -0,0 +1,87 @@
1
+ # Kreuzberg Docker Images
2
+
3
+ [![GitHub](https://img.shields.io/badge/GitHub-Goldziher%2Fkreuzberg-blue)](https://github.com/Goldziher/kreuzberg)
4
+ [![PyPI](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
7
+
8
+ High-performance Python library for text extraction from documents, available as Docker images.
9
+
10
+ **Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
16
+ ```
17
+
18
+ ## Available Tags
19
+
20
+ - `latest` - Latest stable release with API server and Tesseract OCR
21
+ - `X.Y.Z` - Specific version (e.g., `3.0.0`)
22
+ - `X.Y.Z-easyocr` - With EasyOCR support
23
+ - `X.Y.Z-paddle` - With PaddleOCR support
24
+ - `X.Y.Z-gmft` - With GMFT table extraction
25
+ - `X.Y.Z-all` - With all optional dependencies
26
+
27
+ ## Usage
28
+
29
+ ### Extract Files via API
30
+
31
+ ```bash
32
+ # Single file
33
+ curl -X POST http://localhost:8000/extract \
34
+ -F "data=@document.pdf"
35
+
36
+ # Multiple files
37
+ curl -X POST http://localhost:8000/extract \
38
+ -F "data=@document1.pdf" \
39
+ -F "data=@document2.docx"
40
+ ```
41
+
42
+ ### Docker Compose
43
+
44
+ ```yaml
45
+ version: '3.8'
46
+
47
+ services:
48
+ kreuzberg:
49
+ image: goldziher/kreuzberg:latest
50
+ ports:
51
+ - "8000:8000"
52
+ restart: unless-stopped
53
+ ```
54
+
55
+ ## Features
56
+
57
+ - **🚀 High Performance**: Optimized for speed and efficiency
58
+ - **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
59
+ - **🔍 OCR Support**: Built-in Tesseract, optional EasyOCR/PaddleOCR
60
+ - **📊 Table Extraction**: Extract tables with GMFT
61
+ - **🔒 Secure**: Runs as non-root user, no external API calls
62
+ - **📦 Ready to Use**: Pre-configured API server
63
+
64
+ ## Documentation
65
+
66
+ - **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
67
+ - **[Full Documentation](https://goldziher.github.io/kreuzberg/)** - Complete user guide and API reference
68
+ - **[API Documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/)** - REST API endpoints and usage
69
+ - **[Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/)** - Detailed Docker usage guide
70
+
71
+ ## Support
72
+
73
+ - **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
74
+ - **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
75
+ - **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
76
+
77
+ ## Contributing
78
+
79
+ Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
80
+
81
+ ## License
82
+
83
+ MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
84
+
85
+ ______________________________________________________________________
86
+
87
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
@@ -0,0 +1,15 @@
1
+ *.pyc
2
+ *.pyd
3
+ *.pyo
4
+ .git
5
+ .github/
6
+ .gitignore
7
+ .idea
8
+ .mypy_cache
9
+ .pytest_cache
10
+ .ruff_cache
11
+ .vscode
12
+ __pycache__
13
+ benchmarks/
14
+ docs/
15
+ tests/
@@ -0,0 +1,15 @@
1
+ # Performance Baseline
2
+
3
+ This directory contains baseline performance metrics for the Kreuzberg library.
4
+
5
+ ## Files
6
+
7
+ - `baseline.json` - Performance baseline automatically updated from main branch CI
8
+ - This file is used for performance regression detection in PRs
9
+
10
+ ## How it works
11
+
12
+ 1. When code is pushed to `main`, CI runs benchmarks and stores results as `baseline.json`
13
+ 1. When PRs are opened, CI compares current performance against this baseline
14
+ 1. If performance degrades beyond the threshold (20%), the CI check fails
15
+ 1. The baseline is automatically updated when new changes are merged to main
@@ -0,0 +1,6 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "github-actions"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "daily"
@@ -0,0 +1,124 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+ - feat/smart-multiprocessing
11
+
12
+ jobs:
13
+ validate:
14
+ runs-on: ubuntu-latest
15
+ timeout-minutes: 10
16
+ steps:
17
+ - name: Checkout
18
+ uses: actions/checkout@v4
19
+
20
+ - name: Install uv
21
+ uses: astral-sh/setup-uv@v6
22
+ with:
23
+ enable-cache: true
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v5
27
+ with:
28
+ python-version-file: "pyproject.toml"
29
+
30
+ - name: Install Dependencies
31
+ shell: bash
32
+ run: |
33
+ if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
34
+ echo "Removing existing .venv directory on Windows"
35
+ rm -rf .venv
36
+ fi
37
+ uv sync --all-packages --all-extras --dev
38
+
39
+ - name: Load Cached Pre-Commit Dependencies
40
+ id: cached-pre-commit-dependencies
41
+ uses: actions/cache@v4
42
+ with:
43
+ path: ~/.cache/pre-commit/
44
+ key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
45
+
46
+ - name: Execute Pre-Commit
47
+ run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
48
+
49
+ test:
50
+ strategy:
51
+ matrix:
52
+ os: [ ubuntu-latest, macOS-latest, windows-latest ]
53
+ python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.9", "3.10", "3.11", "3.12", "3.13"]') }}
54
+ runs-on: ${{ matrix.os }}
55
+ timeout-minutes: 30
56
+ steps:
57
+ - name: Checkout
58
+ uses: actions/checkout@v4
59
+
60
+ - name: Install uv
61
+ uses: astral-sh/setup-uv@v6
62
+ with:
63
+ enable-cache: true
64
+
65
+ - name: Install Python
66
+ uses: actions/setup-python@v5
67
+ id: setup-python
68
+ with:
69
+ python-version: ${{ matrix.python }}
70
+
71
+ - name: Cache Python Dependencies
72
+ id: python-cache
73
+ uses: actions/cache@v4
74
+ with:
75
+ path: |
76
+ ~/.cache/uv
77
+ .venv
78
+ key: python-dependencies-${{ matrix.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('uv.lock') }}
79
+ restore-keys: |
80
+ python-dependencies-${{ matrix.os }}-${{ matrix.python }}-
81
+
82
+ - name: Install Dependencies
83
+ shell: bash
84
+ run: |
85
+ if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
86
+ echo "Removing existing .venv directory on Windows"
87
+ rm -rf .venv
88
+ fi
89
+ uv sync --all-packages --all-extras --dev
90
+
91
+ - name: Cache Test Artifacts
92
+ uses: actions/cache@v4
93
+ with:
94
+ path: .pytest_cache/
95
+ key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
96
+
97
+ - name: Cache and Install Homebrew (macOS)
98
+ if: runner.os == 'macOS'
99
+ uses: tecolicom/actions-use-homebrew-tools@v1
100
+ with:
101
+ tools: 'tesseract tesseract-lang pandoc'
102
+ key: 'homebrew-tools-${{ runner.os }}'
103
+ cache: yes
104
+ verbose: false
105
+
106
+ - name: Cache and Install APT Packages (Linux)
107
+ if: runner.os == 'Linux'
108
+ uses: awalsh128/cache-apt-pkgs-action@latest
109
+ with:
110
+ packages: tesseract-ocr tesseract-ocr-deu pandoc
111
+ version: 1.0
112
+
113
+ - name: Install System Dependencies (Windows)
114
+ if: runner.os == 'Windows'
115
+ run: |
116
+ choco install -y tesseract pandoc
117
+ Write-Output "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
118
+ Write-Output "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
119
+ $env:PATH = "C:\Program Files\Tesseract-OCR;C:\Program Files\Pandoc;" + $env:PATH
120
+ tesseract --version
121
+ pandoc --version
122
+
123
+ - name: Run Tests
124
+ run: uv run pytest -s -vvv
@@ -0,0 +1,20 @@
1
+ name: "Check PR Title"
2
+
3
+ on:
4
+ pull_request_target:
5
+ types:
6
+ - opened
7
+ - edited
8
+ - synchronize
9
+
10
+ permissions:
11
+ pull-requests: read
12
+
13
+ jobs:
14
+ main:
15
+ name: Validate PR title
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: amannn/action-semantic-pull-request@v5
19
+ env:
20
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,101 @@
1
+ # .github/workflows/publish-docker.yml
2
+
3
+ name: Publish Docker Images
4
+
5
+ on:
6
+ workflow_run:
7
+ workflows: ["Release"]
8
+ types:
9
+ - completed
10
+ branches:
11
+ - main
12
+
13
+ jobs:
14
+ build-and-push:
15
+ runs-on: ubuntu-latest
16
+ if: ${{ github.event.workflow_run.conclusion == 'success' }}
17
+ permissions:
18
+ contents: read
19
+ packages: write
20
+
21
+ strategy:
22
+ matrix:
23
+ include:
24
+ - name: core
25
+ extras: ""
26
+ tag_suffix: "" # The base image tag (includes API + tesseract)
27
+ - name: easyocr
28
+ extras: "easyocr"
29
+ tag_suffix: "-easyocr"
30
+ - name: paddle
31
+ extras: "paddleocr"
32
+ tag_suffix: "-paddle"
33
+ - name: gmft
34
+ extras: "gmft"
35
+ tag_suffix: "-gmft"
36
+ - name: all
37
+ extras: "all"
38
+ tag_suffix: "-all"
39
+
40
+ steps:
41
+ - name: Checkout repository
42
+ uses: actions/checkout@v4
43
+ with:
44
+ ref: ${{ github.event.workflow_run.head_branch }}
45
+
46
+ - name: Get release version
47
+ id: get_version
48
+ run: |
49
+ echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
50
+ # If the triggering ref already looks like a version tag (vX.Y.Z...), use it as-is
51
+ if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
52
+ echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
53
+ else
54
+ # Otherwise fall back to the most recent tag reachable from the checked-out commit
55
+ git fetch --tags
56
+ echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
57
+ fi
58
+
59
+ - name: Set up QEMU
60
+ uses: docker/setup-qemu-action@v3
61
+
62
+ - name: Set up Docker Buildx
63
+ uses: docker/setup-buildx-action@v3
64
+
65
+ - name: Log in to Docker Hub
66
+ uses: docker/login-action@v3
67
+ with:
68
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
69
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
70
+
71
+ - name: Extract metadata (tags, labels) for Docker
72
+ id: meta
73
+ uses: docker/metadata-action@v5
74
+ with:
75
+ images: goldziher/kreuzberg
76
+ tags: |
77
+ # Release version tag (e.g., v3.0.0-easyocr)
78
+ type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
79
+ # Latest tag for each variant (e.g., latest-easyocr)
80
+ type=raw,value=latest${{ matrix.tag_suffix }}
81
+
82
+ - name: Build and push Docker image
83
+ uses: docker/build-push-action@v5
84
+ with:
85
+ context: .
86
+ file: ./.docker/Dockerfile
87
+ platforms: linux/amd64,linux/arm64
88
+ push: true
89
+ build-args: |
90
+ EXTRAS=${{ matrix.extras }}
91
+ tags: ${{ steps.meta.outputs.tags }}
92
+ labels: ${{ steps.meta.outputs.labels }}
93
+
94
+ - name: Update Docker Hub README
95
+ uses: peter-evans/dockerhub-description@v4
96
+ if: matrix.name == 'core'
97
+ with:
98
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
99
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
100
+ repository: goldziher/kreuzberg
101
+ readme-filepath: ./.docker/README.md
@@ -0,0 +1,31 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ release:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v6
19
+ with:
20
+ enable-cache: true
21
+
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version-file: "pyproject.toml"
26
+
27
+ - name: Install Dependencies
28
+ run: uv build
29
+
30
+ - name: Publish
31
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,34 @@
1
+ *$py.class
2
+ *.Cache
3
+ *.cscfg
4
+ *.egg-info/
5
+ *.log
6
+ *.py[cod]
7
+ *.suo
8
+ *.user
9
+ *temp/
10
+ .coverage
11
+ .coverage*
12
+ .cursorrules
13
+ .dist/
14
+ .DS_store
15
+ .env
16
+ .idea/
17
+ .kreuzberg/
18
+ .mypy_cache/
19
+ .pytest_cache/
20
+ .python-version
21
+ .ropeproject
22
+ .ruff_cache/
23
+ .run/
24
+ .venv/
25
+ .vscode/
26
+ .windsurfrules
27
+ __pycache__/
28
+ benchmark_results.json
29
+ CLAUDE.md
30
+ coverage.xml
31
+ docker-compose.yaml
32
+ GEMINI.md
33
+ prompt_template.egg-info/
34
+ requirements.txt
@@ -0,0 +1,17 @@
1
+ default: true
2
+
3
+ MD007:
4
+ indent: 4
5
+
6
+ MD033: false
7
+
8
+ MD041: false
9
+
10
+ MD013: false
11
+
12
+ MD014: false
13
+
14
+ MD024:
15
+ siblings_only: true
16
+
17
+ MD046: false
@@ -0,0 +1,86 @@
1
+ repos:
2
+ - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
3
+ rev: "v9.22.0"
4
+ hooks:
5
+ - id: commitlint
6
+ stages: [commit-msg]
7
+ additional_dependencies: ["@commitlint/config-conventional"]
8
+ - repo: https://github.com/Goldziher/ai-rulez
9
+ rev: v1.1.2
10
+ hooks:
11
+ - id: ai-rulez-validate
12
+ - id: ai-rulez-generate
13
+ - repo: https://github.com/pre-commit/pre-commit-hooks
14
+ rev: v5.0.0
15
+ hooks:
16
+ - id: name-tests-test
17
+ args:
18
+ - --pytest
19
+ exclude: factories|test_utils|completion.py|test_data
20
+ - id: trailing-whitespace
21
+ - id: end-of-file-fixer
22
+ - id: check-toml
23
+ - id: check-case-conflict
24
+ - id: detect-private-key
25
+ - repo: https://github.com/abravalheri/validate-pyproject
26
+ rev: v0.24.1
27
+ hooks:
28
+ - id: validate-pyproject
29
+ - repo: https://github.com/executablebooks/mdformat
30
+ rev: 0.7.22
31
+ hooks:
32
+ - id: mdformat
33
+ additional_dependencies:
34
+ - mdformat-mkdocs==4.0.0
35
+ - repo: https://github.com/igorshubovych/markdownlint-cli
36
+ rev: v0.45.0
37
+ hooks:
38
+ - id: markdownlint-fix
39
+ - repo: https://github.com/adamchainz/blacken-docs
40
+ rev: 1.19.1
41
+ hooks:
42
+ - id: blacken-docs
43
+ args: ["--pyi", "--line-length", "130"]
44
+ additional_dependencies:
45
+ - black==25.1.0
46
+ - repo: https://github.com/rbubley/mirrors-prettier
47
+ rev: "v3.6.2"
48
+ hooks:
49
+ - id: prettier
50
+ exclude: ^tests|^.idea|^migrations|^.git|README.md|^docs
51
+ - repo: https://github.com/tox-dev/pyproject-fmt
52
+ rev: "v2.6.0"
53
+ hooks:
54
+ - id: pyproject-fmt
55
+ - repo: https://github.com/astral-sh/ruff-pre-commit
56
+ rev: v0.12.1
57
+ hooks:
58
+ - id: ruff
59
+ args: ["--fix", "--unsafe-fixes"]
60
+ - id: ruff-format
61
+ - repo: https://github.com/codespell-project/codespell
62
+ rev: v2.4.1
63
+ hooks:
64
+ - id: codespell
65
+ exclude: ^tests|^scripts|^kreuzberg/_tesseract|^kreuzberg/_mime_types
66
+ additional_dependencies:
67
+ - tomli
68
+ - repo: https://github.com/jsh9/pydoclint
69
+ rev: 0.6.7
70
+ hooks:
71
+ - id: pydoclint
72
+ args:
73
+ [
74
+ --style=google,
75
+ --check-return-types=False,
76
+ --arg-type-hints-in-docstring=False,
77
+ ]
78
+ exclude: ^benchmarks/|^kreuzberg/_|^tests/|^scripts/|^run_benchmarks\.py
79
+ - repo: local
80
+ hooks:
81
+ - id: mypy
82
+ name: mypy
83
+ entry: uv run mypy
84
+ require_serial: true
85
+ language: system
86
+ types: [python]