kreuzberg 3.11.3__tar.gz → 3.13.0__tar.gz

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (276)
  1. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.docker/Dockerfile +27 -2
  2. kreuzberg-3.13.0/.docker/README.md +190 -0
  3. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/ci.yaml +0 -6
  4. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/docker-e2e-tests.yml +4 -8
  5. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/docs.yml +1 -1
  6. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/publish-docker.yml +76 -27
  7. kreuzberg-3.13.0/.github/workflows/test-docker-builds.yml +97 -0
  8. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.gitignore +27 -0
  9. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.pre-commit-config.yaml +3 -3
  10. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/PKG-INFO +17 -14
  11. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/README.md +13 -9
  12. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/Taskfile.yml +17 -16
  13. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/ai-rulez.yaml +0 -37
  14. kreuzberg-3.13.0/benchmarks/README.md +234 -0
  15. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/benchmarks/pyproject.toml +5 -2
  16. kreuzberg-3.13.0/benchmarks/src/__init__.py +1 -0
  17. kreuzberg-3.13.0/benchmarks/src/__main__.py +4 -0
  18. {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/benchmarks.py +0 -9
  19. {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/cli.py +316 -15
  20. {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/models.py +2 -22
  21. {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/profiler.py +0 -14
  22. {kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks → kreuzberg-3.13.0/benchmarks/src}/runner.py +2 -26
  23. kreuzberg-3.13.0/docker-compose.example.yml +26 -0
  24. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/types.md +13 -0
  25. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/cli.md +36 -1
  26. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/examples/extraction-examples.md +103 -0
  27. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/installation.md +8 -38
  28. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/index.md +1 -1
  29. kreuzberg-3.13.0/docs/user-guide/api-server.md +313 -0
  30. kreuzberg-3.13.0/docs/user-guide/docker.md +548 -0
  31. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/document-classification.md +1 -1
  32. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/extraction-configuration.md +162 -27
  33. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/ocr-backends.md +17 -13
  34. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/ocr-configuration.md +125 -3
  35. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/__init__.py +14 -13
  36. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/__main__.py +0 -2
  37. kreuzberg-3.13.0/kreuzberg/_api/main.py +218 -0
  38. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_config.py +248 -204
  39. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_document_classification.py +0 -8
  40. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_entity_extraction.py +1 -93
  41. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_base.py +0 -5
  42. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_email.py +1 -11
  43. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_html.py +9 -12
  44. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_image.py +1 -23
  45. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_pandoc.py +10 -89
  46. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_pdf.py +39 -92
  47. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_presentation.py +0 -17
  48. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_spread_sheet.py +13 -53
  49. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_extractors/_structured.py +1 -4
  50. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_gmft.py +14 -138
  51. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_language_detection.py +1 -22
  52. kreuzberg-3.13.0/kreuzberg/_mcp/__init__.py +3 -0
  53. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_mcp/server.py +3 -10
  54. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_mime_types.py +1 -2
  55. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_easyocr.py +21 -108
  56. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_paddleocr.py +16 -94
  57. kreuzberg-3.13.0/kreuzberg/_ocr/_table_extractor.py +260 -0
  58. kreuzberg-3.13.0/kreuzberg/_ocr/_tesseract.py +1629 -0
  59. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_playa.py +5 -4
  60. kreuzberg-3.13.0/kreuzberg/_types.py +1011 -0
  61. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_cache.py +88 -90
  62. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_device.py +0 -18
  63. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_document_cache.py +0 -2
  64. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_errors.py +0 -3
  65. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_pdf_lock.py +0 -2
  66. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_process_pool.py +19 -19
  67. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_quality.py +0 -43
  68. kreuzberg-3.13.0/kreuzberg/_utils/_ref.py +48 -0
  69. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_serialization.py +0 -5
  70. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_string.py +9 -39
  71. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_sync.py +0 -1
  72. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_table.py +50 -57
  73. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/cli.py +55 -77
  74. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/extraction.py +39 -32
  75. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/mkdocs.yaml +1 -1
  76. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/pyproject.toml +11 -13
  77. kreuzberg-3.13.0/results/baseline.json +9 -0
  78. kreuzberg-3.13.0/results/serialization.json +11 -0
  79. kreuzberg-3.13.0/results/statistical.json +21 -0
  80. kreuzberg-3.13.0/tests/api/conftest.py +17 -0
  81. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/api/main_test.py +140 -88
  82. kreuzberg-3.13.0/tests/api/runtime_config_test.py +322 -0
  83. kreuzberg-3.13.0/tests/cli_command_test.py +481 -0
  84. kreuzberg-3.13.0/tests/cli_integration_test.py +858 -0
  85. kreuzberg-3.13.0/tests/cli_test.py +324 -0
  86. kreuzberg-3.13.0/tests/config_test.py +1540 -0
  87. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/conftest.py +0 -8
  88. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/document_classification_test.py +12 -96
  89. kreuzberg-3.11.3/tests/e2e/docker_images_test.py → kreuzberg-3.13.0/tests/e2e/docker_e2e_test.py +11 -130
  90. kreuzberg-3.13.0/tests/entity_extraction_test.py +589 -0
  91. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/exceptions_test.py +0 -10
  92. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extraction_batch_test.py +3 -28
  93. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extraction_test.py +9 -91
  94. kreuzberg-3.13.0/tests/extractors/email_test.py +924 -0
  95. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extractors/html_test.py +0 -2
  96. kreuzberg-3.13.0/tests/extractors/image_test.py +693 -0
  97. kreuzberg-3.13.0/tests/extractors/pandoc_test.py +1996 -0
  98. kreuzberg-3.13.0/tests/extractors/pdf_test.py +900 -0
  99. kreuzberg-3.13.0/tests/extractors/presentation_test.py +934 -0
  100. kreuzberg-3.13.0/tests/extractors/spreed_sheet_test.py +1121 -0
  101. kreuzberg-3.13.0/tests/extractors/structured_test.py +304 -0
  102. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/gmft_extended_test.py +2 -17
  103. kreuzberg-3.13.0/tests/gmft_test.py +785 -0
  104. kreuzberg-3.13.0/tests/html_to_markdown_config_test.py +217 -0
  105. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/language_detection_test.py +3 -24
  106. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/mcp_server_test.py +19 -145
  107. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/mime_types_test.py +0 -4
  108. kreuzberg-3.13.0/tests/multiprocessing/__init__.py +0 -0
  109. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/multiprocessing/gmft_integration_test.py +2 -9
  110. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/multiprocessing/gmft_isolated_test.py +4 -50
  111. kreuzberg-3.13.0/tests/multiprocessing/process_manager_test.py +273 -0
  112. kreuzberg-3.13.0/tests/multiprocessing/tesseract_pool_test.py +331 -0
  113. kreuzberg-3.13.0/tests/ocr/__init__.py +0 -0
  114. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/base_test.py +0 -13
  115. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/device_integration_test.py +0 -3
  116. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/easyocr_test.py +0 -15
  117. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/init_test.py +0 -6
  118. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/ocr/paddleocr_test.py +0 -10
  119. kreuzberg-3.13.0/tests/ocr/tesseract_test.py +1154 -0
  120. kreuzberg-3.13.0/tests/ocr/tesseract_tsv_integration_test.py +273 -0
  121. kreuzberg-3.13.0/tests/ocr/tesseract_tsv_test.py +382 -0
  122. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/playa_helpers_test.py +8 -84
  123. kreuzberg-3.13.0/tests/tesseract_sync_formats_test.py +168 -0
  124. kreuzberg-3.13.0/tests/test_source_files/contract.txt +1 -0
  125. kreuzberg-3.13.0/tests/test_source_files/tables/borderless_table.png +0 -0
  126. kreuzberg-3.13.0/tests/test_source_files/tables/complex_document.png +0 -0
  127. kreuzberg-3.13.0/tests/test_source_files/tables/simple_table.png +0 -0
  128. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/types_test.py +10 -76
  129. kreuzberg-3.13.0/tests/utils/__init__.py +0 -0
  130. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/cache_test.py +26 -69
  131. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/device_test.py +0 -2
  132. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/errors_test.py +129 -94
  133. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/pdf_lock_test.py +0 -18
  134. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/process_pool_test.py +9 -29
  135. kreuzberg-3.13.0/tests/utils/ref_test.py +90 -0
  136. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/serialization_test.py +1 -40
  137. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/string_test.py +12 -66
  138. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/sync_test.py +0 -43
  139. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/table_test.py +18 -78
  140. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/uv.lock +633 -615
  141. kreuzberg-3.11.3/.docker/README.md +0 -87
  142. kreuzberg-3.11.3/.task/checksum/docker-build-core +0 -1
  143. kreuzberg-3.11.3/.task/checksum/docker-build-easyocr +0 -1
  144. kreuzberg-3.11.3/.task/checksum/docker-build-gmft +0 -1
  145. kreuzberg-3.11.3/.task/checksum/docker-build-paddle +0 -1
  146. kreuzberg-3.11.3/benchmarks/README.md +0 -152
  147. kreuzberg-3.11.3/benchmarks/benchmark_baseline.py +0 -116
  148. kreuzberg-3.11.3/benchmarks/end_to_end_benchmark.py +0 -238
  149. kreuzberg-3.11.3/benchmarks/final_benchmark.py +0 -147
  150. kreuzberg-3.11.3/benchmarks/results/baseline_results.json +0 -35
  151. kreuzberg-3.11.3/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -50
  152. kreuzberg-3.11.3/benchmarks/results/comprehensive_caching_results.json +0 -55
  153. kreuzberg-3.11.3/benchmarks/results/final_benchmark_results.json +0 -12
  154. kreuzberg-3.11.3/benchmarks/results/latest.json +0 -607
  155. kreuzberg-3.11.3/benchmarks/results/mime_caching_results.json +0 -18
  156. kreuzberg-3.11.3/benchmarks/results/msgspec_caching_results.json +0 -10
  157. kreuzberg-3.11.3/benchmarks/results/ocr_caching_results.json +0 -17
  158. kreuzberg-3.11.3/benchmarks/results/serialization_benchmark_results.json +0 -42
  159. kreuzberg-3.11.3/benchmarks/results/statistical_benchmark_results.json +0 -26
  160. kreuzberg-3.11.3/benchmarks/results/table_caching_results.json +0 -17
  161. kreuzberg-3.11.3/benchmarks/serialization_benchmark.py +0 -165
  162. kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -3
  163. kreuzberg-3.11.3/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -6
  164. kreuzberg-3.11.3/benchmarks/statistical_benchmark.py +0 -219
  165. kreuzberg-3.11.3/docs/performance-analysis.md +0 -168
  166. kreuzberg-3.11.3/docs/user-guide/api-server.md +0 -169
  167. kreuzberg-3.11.3/docs/user-guide/docker.md +0 -389
  168. kreuzberg-3.11.3/kreuzberg/_api/main.py +0 -108
  169. kreuzberg-3.11.3/kreuzberg/_mcp/__init__.py +0 -5
  170. kreuzberg-3.11.3/kreuzberg/_ocr/_tesseract.py +0 -987
  171. kreuzberg-3.11.3/kreuzberg/_types.py +0 -413
  172. kreuzberg-3.11.3/tests/cli_command_test.py +0 -523
  173. kreuzberg-3.11.3/tests/cli_integration_test.py +0 -531
  174. kreuzberg-3.11.3/tests/cli_test.py +0 -335
  175. kreuzberg-3.11.3/tests/config_test.py +0 -1570
  176. kreuzberg-3.11.3/tests/e2e/run_docker_tests.sh +0 -371
  177. kreuzberg-3.11.3/tests/e2e/test_report.json +0 -14
  178. kreuzberg-3.11.3/tests/entity_extraction_test.py +0 -675
  179. kreuzberg-3.11.3/tests/extractors/email_test.py +0 -1003
  180. kreuzberg-3.11.3/tests/extractors/image_test.py +0 -768
  181. kreuzberg-3.11.3/tests/extractors/pandoc_test.py +0 -2123
  182. kreuzberg-3.11.3/tests/extractors/pdf_test.py +0 -973
  183. kreuzberg-3.11.3/tests/extractors/presentation_test.py +0 -1005
  184. kreuzberg-3.11.3/tests/extractors/spreed_sheet_test.py +0 -1237
  185. kreuzberg-3.11.3/tests/extractors/structured_test.py +0 -302
  186. kreuzberg-3.11.3/tests/gmft_test.py +0 -839
  187. kreuzberg-3.11.3/tests/multiprocessing/__init__.py +0 -1
  188. kreuzberg-3.11.3/tests/multiprocessing/process_manager_test.py +0 -282
  189. kreuzberg-3.11.3/tests/multiprocessing/tesseract_pool_test.py +0 -349
  190. kreuzberg-3.11.3/tests/ocr/tesseract_test.py +0 -1141
  191. kreuzberg-3.11.3/tests/utils_errors_test.py +0 -299
  192. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.commitlintrc +0 -0
  193. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.deepsource.toml +0 -0
  194. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.dockerignore +0 -0
  195. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/dependabot.yaml +0 -0
  196. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/pr-title.yaml +0 -0
  197. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.github/workflows/release.yaml +0 -0
  198. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/.markdownlint.yaml +0 -0
  199. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/LICENSE +0 -0
  200. {kreuzberg-3.11.3/kreuzberg/_api → kreuzberg-3.13.0/benchmarks}/__init__.py +0 -0
  201. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/custom-extractors.md +0 -0
  202. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/custom-hooks.md +0 -0
  203. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/error-handling.md +0 -0
  204. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/index.md +0 -0
  205. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/advanced/performance.md +0 -0
  206. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/exceptions.md +0 -0
  207. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/extraction-functions.md +0 -0
  208. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/extractor-registry.md +0 -0
  209. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/index.md +0 -0
  210. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/api-reference/ocr-configuration.md +0 -0
  211. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/assets/favicon.png +0 -0
  212. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/assets/logo.png +0 -0
  213. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/contributing.md +0 -0
  214. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/css/extra.css +0 -0
  215. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/examples/index.md +0 -0
  216. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/index.md +0 -0
  217. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/getting-started/quick-start.md +0 -0
  218. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/basic-usage.md +0 -0
  219. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/chunking.md +0 -0
  220. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/index.md +0 -0
  221. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/mcp-server.md +0 -0
  222. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/metadata-extraction.md +0 -0
  223. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/docs/user-guide/supported-formats.md +0 -0
  224. {kreuzberg-3.11.3/kreuzberg/_extractors → kreuzberg-3.13.0/kreuzberg/_api}/__init__.py +0 -0
  225. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_chunker.py +0 -0
  226. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_constants.py +0 -0
  227. {kreuzberg-3.11.3/kreuzberg/_utils → kreuzberg-3.13.0/kreuzberg/_extractors}/__init__.py +0 -0
  228. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/__init__.py +0 -0
  229. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_ocr/_base.py +0 -0
  230. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_registry.py +0 -0
  231. {kreuzberg-3.11.3/tests → kreuzberg-3.13.0/kreuzberg/_utils}/__init__.py +0 -0
  232. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/_utils/_tmp.py +0 -0
  233. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/exceptions.py +0 -0
  234. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/kreuzberg/py.typed +0 -0
  235. /kreuzberg-3.11.3/tests/api/__init__.py → /kreuzberg-3.13.0/output.txt +0 -0
  236. {kreuzberg-3.11.3/tests/extractors → kreuzberg-3.13.0/tests}/__init__.py +0 -0
  237. {kreuzberg-3.11.3/tests/ocr → kreuzberg-3.13.0/tests/api}/__init__.py +0 -0
  238. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/chunker_test.py +0 -0
  239. {kreuzberg-3.11.3/tests/utils → kreuzberg-3.13.0/tests/e2e}/__init__.py +0 -0
  240. /kreuzberg-3.11.3/tests/test_source_files/contract.txt → /kreuzberg-3.13.0/tests/extractors/__init__.py +0 -0
  241. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  242. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/hooks_test.py +0 -0
  243. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/playa_test.py +0 -0
  244. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/registry_test.py +0 -0
  245. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/contract_test.txt +0 -0
  246. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/document.docx +0 -0
  247. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/email/sample-email.eml +0 -0
  248. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  249. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/excel.xlsx +0 -0
  250. /kreuzberg-3.11.3/tests/test_source_files/better-ocr-image.jpg → /kreuzberg-3.13.0/tests/test_source_files/flower-no-text.jpg +0 -0
  251. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/form_test.txt +0 -0
  252. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/french-text.txt +0 -0
  253. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/german-text.txt +0 -0
  254. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/html.html +0 -0
  255. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  256. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/invoice_image.png +0 -0
  257. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/invoice_test.txt +0 -0
  258. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/json/sample-document.json +0 -0
  259. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  260. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/markdown.md +0 -0
  261. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  262. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/non-searchable.pdf +0 -0
  263. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/ocr-image.jpg +0 -0
  264. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  265. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  266. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  267. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  268. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/receipt_test.txt +0 -0
  269. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/report_test.txt +0 -0
  270. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/sample-contract.pdf +0 -0
  271. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/scanned.pdf +0 -0
  272. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/searchable.pdf +0 -0
  273. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/spanish-text.txt +0 -0
  274. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/test-article.pdf +0 -0
  275. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  276. {kreuzberg-3.11.3 → kreuzberg-3.13.0}/tests/utils/tmp_test.py +0 -0
@@ -13,7 +13,15 @@ COPY kreuzberg kreuzberg
13
13
 
14
14
  # Install dependencies with optimizations
15
15
  RUN --mount=type=cache,target=/tmp/uv-cache \
16
- uv sync --extra api${EXTRAS:+ --extra ${EXTRAS}} --no-editable --no-dev --compile-bytecode && \
16
+ if [ -z "$EXTRAS" ]; then \
17
+ uv sync --extra api --no-editable --no-dev --compile-bytecode; \
18
+ else \
19
+ extras_args="--extra api"; \
20
+ for extra in $EXTRAS; do \
21
+ extras_args="$extras_args --extra $extra"; \
22
+ done; \
23
+ uv sync $extras_args --no-editable --no-dev --compile-bytecode; \
24
+ fi && \
17
25
  rm -rf /app/.venv/lib/python*/site-packages/**/__pycache__ && \
18
26
  find /app/.venv -type f -name "*.pyc" -delete && \
19
27
  find /app/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
@@ -28,11 +36,24 @@ ENV PYTHONUNBUFFERED=1
28
36
  ENV PATH="/app/.venv/bin:$PATH"
29
37
 
30
38
  # Install runtime dependencies
39
+ # Languages included: English (default), Spanish, French, German, Italian, Portuguese,
40
+ # Chinese (simplified & traditional), Japanese, Arabic, Russian, Hindi ~keep
31
41
  RUN apt-get update && apt-get install -y --no-install-recommends \
32
42
  pandoc \
33
43
  tesseract-ocr \
34
44
  tesseract-ocr-eng \
35
45
  tesseract-ocr-osd \
46
+ tesseract-ocr-spa \
47
+ tesseract-ocr-fra \
48
+ tesseract-ocr-deu \
49
+ tesseract-ocr-ita \
50
+ tesseract-ocr-por \
51
+ tesseract-ocr-chi-sim \
52
+ tesseract-ocr-chi-tra \
53
+ tesseract-ocr-jpn \
54
+ tesseract-ocr-ara \
55
+ tesseract-ocr-rus \
56
+ tesseract-ocr-hin \
36
57
  libglib2.0-0 \
37
58
  libsm6 \
38
59
  libxext6 \
@@ -46,9 +67,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
46
67
  COPY --from=builder /app/.venv /app/.venv
47
68
  COPY --from=builder /app/kreuzberg /app/kreuzberg
48
69
 
49
- # Create non-root user
70
+ # Create non-root user and cache directory
50
71
  RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser && \
72
+ mkdir -p /app/.kreuzberg && \
51
73
  chown -R appuser:appuser /app
52
74
 
75
+ # Set default cache directory to prevent permission issues
76
+ ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg
77
+
53
78
  USER appuser
54
79
  CMD ["litestar", "--app", "kreuzberg._api.main:app", "run", "--host", "0.0.0.0"]
@@ -0,0 +1,190 @@
1
+ # Kreuzberg Docker Images
2
+
3
+ [![GitHub](https://img.shields.io/badge/GitHub-Goldziher%2Fkreuzberg-blue)](https://github.com/Goldziher/kreuzberg)
4
+ [![PyPI](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE)
7
+
8
+ High-performance Python library for text extraction from documents, available as optimized Docker images.
9
+
10
+ **Source Code**: [github.com/Goldziher/kreuzberg](https://github.com/Goldziher/kreuzberg)
11
+
12
+ ## Quick Start
13
+
14
+ ```bash
15
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
16
+ ```
17
+
18
+ ## Available Images
19
+
20
+ ### Base Image (`latest`)
21
+
22
+ - **Image**: `goldziher/kreuzberg:latest`
23
+ - **Size**: ~550MB compressed
24
+ - **Includes**: REST API server, CLI tools, Tesseract OCR with 12 business languages
25
+ - **Languages**: English, Spanish, French, German, Italian, Portuguese, Chinese (Simplified & Traditional), Japanese, Arabic, Russian, Hindi
26
+ - **Use cases**: Basic document processing, simple API deployments, cost-conscious workflows
27
+
28
+ ### Core Image (`core`)
29
+
30
+ - **Image**: `goldziher/kreuzberg-core:latest`
31
+ - **Size**: ~700MB compressed
32
+ - **Includes**: Everything from base plus:
33
+ - Text chunking (semantic-text-splitter)
34
+ - Encrypted PDF support (crypto)
35
+ - Document classification
36
+ - Language detection
37
+ - Email parsing (.eml, .msg)
38
+ - Additional format extensions
39
+ - **Use cases**: RAG applications, document intelligence, enterprise workflows, multi-language processing
40
+
41
+ ## Usage
42
+
43
+ ### Extract Files via API
44
+
45
+ ```bash
46
+ # Single file with base image
47
+ curl -X POST http://localhost:8000/extract \
48
+ -F "data=@document.pdf"
49
+
50
+ # With core image - chunking for RAG
51
+ docker run -p 8000:8000 goldziher/kreuzberg-core:latest
52
+ curl -X POST http://localhost:8000/extract \
53
+ -F "data=@document.pdf" \
54
+ -F "chunk_content=true" \
55
+ -F "max_chars=1000"
56
+
57
+ # Language detection
58
+ curl -X POST http://localhost:8000/extract \
59
+ -F "data=@document.pdf" \
60
+ -F "auto_detect_language=true"
61
+
62
+ # Encrypted PDF
63
+ curl -X POST http://localhost:8000/extract \
64
+ -F "data=@encrypted.pdf" \
65
+ -F "password=secretpassword"
66
+ ```
67
+
68
+ ### Docker Compose
69
+
70
+ ```yaml
71
+ version: '3.8'
72
+
73
+ services:
74
+ kreuzberg:
75
+ image: goldziher/kreuzberg-core:latest
76
+ ports:
77
+ - "8000:8000"
78
+ volumes:
79
+ - kreuzberg-cache:/app/.kreuzberg
80
+ environment:
81
+ - PYTHONUNBUFFERED=1
82
+ - KREUZBERG_CACHE_DIR=/app/.kreuzberg
83
+ restart: unless-stopped
84
+ healthcheck:
85
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
86
+ interval: 30s
87
+ timeout: 10s
88
+ retries: 3
89
+
90
+ volumes:
91
+ kreuzberg-cache:
92
+ ```
93
+
94
+ ## Custom Images
95
+
96
+ Create tailored images for your specific needs:
97
+
98
+ ### Example: RAG-Optimized
99
+
100
+ ```dockerfile
101
+ FROM goldziher/kreuzberg:latest
102
+
103
+ USER root
104
+
105
+ # Add chunking and language detection
106
+ RUN pip install --upgrade "kreuzberg[chunking,langdetect]"
107
+
108
+ USER appuser
109
+
110
+ # RAG-optimized defaults
111
+ ENV KREUZBERG_CHUNK_CONTENT=true
112
+ ENV KREUZBERG_MAX_CHARS=1000
113
+ ENV KREUZBERG_AUTO_DETECT_LANGUAGE=true
114
+ ```
115
+
116
+ ### Example: Crypto Support
117
+
118
+ ```dockerfile
119
+ FROM goldziher/kreuzberg:latest
120
+
121
+ USER root
122
+
123
+ # Add encrypted PDF support
124
+ RUN pip install --upgrade "kreuzberg[crypto]"
125
+
126
+ USER appuser
127
+ ```
128
+
129
+ ## Configuration
130
+
131
+ ### Environment Variables
132
+
133
+ - `KREUZBERG_CACHE_DIR`: Cache directory (default: `/app/.kreuzberg`)
134
+ - `KREUZBERG_CHUNK_CONTENT`: Enable chunking (`true`/`false`)
135
+ - `KREUZBERG_AUTO_DETECT_LANGUAGE`: Enable language detection (`true`/`false`)
136
+ - `KREUZBERG_OCR_BACKEND`: OCR backend (`tesseract` or `none`)
137
+
138
+ ### Configuration File
139
+
140
+ Mount `kreuzberg.toml`:
141
+
142
+ ```toml
143
+ chunk_content = true
144
+ auto_detect_language = true
145
+ max_chars = 1000
146
+ ocr_backend = "tesseract"
147
+
148
+ [tesseract]
149
+ language = "eng+spa+fra+deu"
150
+ psm = 6
151
+ ```
152
+
153
+ ```bash
154
+ docker run -p 8000:8000 \
155
+ -v "$(pwd)/kreuzberg.toml:/app/kreuzberg.toml:ro" \
156
+ goldziher/kreuzberg-core:latest
157
+ ```
158
+
159
+ ## Features
160
+
161
+ - **🚀 High Performance**: Optimized for speed and efficiency
162
+ - **📄 Multiple Formats**: PDF, DOCX, images, HTML, and more
163
+ - **🔍 OCR Support**: Built-in Tesseract with 12 business languages
164
+ - **🔒 Secure**: Runs as non-root user, no external API calls
165
+ - **📦 Ready to Use**: Pre-configured API server
166
+
167
+ ## Documentation
168
+
169
+ - **[GitHub Repository](https://github.com/Goldziher/kreuzberg)** - Source code and issue tracking
170
+ - **[Full Documentation](https://kreuzberg.dev/)** - Complete user guide and API reference
171
+ - **[API Documentation](https://kreuzberg.dev/user-guide/api-server/)** - REST API endpoints and usage
172
+ - **[Docker Guide](https://kreuzberg.dev/user-guide/docker/)** - Detailed Docker usage guide
173
+
174
+ ## Support
175
+
176
+ - **Issues**: [github.com/Goldziher/kreuzberg/issues](https://github.com/Goldziher/kreuzberg/issues)
177
+ - **Discussions**: [github.com/Goldziher/kreuzberg/discussions](https://github.com/Goldziher/kreuzberg/discussions)
178
+ - **Discord**: [Join our community](https://discord.gg/pXxagNK2zN)
179
+
180
+ ## Contributing
181
+
182
+ Contributions are welcome! See our [Contributing Guide](https://github.com/Goldziher/kreuzberg/blob/main/docs/contributing.md).
183
+
184
+ ## License
185
+
186
+ MIT License - see [LICENSE](https://github.com/Goldziher/kreuzberg/blob/main/LICENSE) for details.
187
+
188
+ ______________________________________________________________________
189
+
190
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
@@ -51,7 +51,6 @@ jobs:
51
51
  - name: Execute Pre-Commit
52
52
  run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
53
53
 
54
- # Coverage job runs first, only on Python 3.13 Ubuntu
55
54
  coverage:
56
55
  needs: validate
57
56
  runs-on: ubuntu-latest
@@ -135,7 +134,6 @@ jobs:
135
134
  .coverage
136
135
  retention-days: 7
137
136
 
138
- # Full test matrix runs only after coverage succeeds
139
137
  test:
140
138
  needs: coverage
141
139
  runs-on: ${{ matrix.os }}
@@ -144,10 +142,6 @@ jobs:
144
142
  matrix:
145
143
  os: [ubuntu-latest, windows-latest, macos-latest]
146
144
  python: ["3.10", "3.11", "3.12", "3.13"]
147
- exclude:
148
- # Skip Python 3.13 on macOS for now due to compatibility issues
149
- - os: macos-latest
150
- python: "3.13"
151
145
  timeout-minutes: 30
152
146
  steps:
153
147
  - name: Checkout
@@ -11,10 +11,8 @@ jobs:
11
11
  strategy:
12
12
  matrix:
13
13
  image:
14
- - { name: "core", extras: "" }
15
- - { name: "easyocr", extras: "easyocr" }
16
- - { name: "paddle", extras: "paddleocr" }
17
- - { name: "gmft", extras: "gmft" }
14
+ - { name: "base", extras: "cli" }
15
+ - { name: "core", extras: "cli chunking crypto document-classification langdetect additional-extensions" }
18
16
  fail-fast: false
19
17
 
20
18
  steps:
@@ -92,9 +90,7 @@ jobs:
92
90
  docker build -f .docker/Dockerfile \
93
91
  --build-arg EXTRAS="${{ matrix.image.extras }}" \
94
92
  -t kreuzberg:${{ matrix.image.name }} \
95
- --cache-from type=gha \
96
- --cache-to type=gha,mode=max \
97
- --load \
93
+ --no-cache \
98
94
  .
99
95
 
100
96
  echo "Built image:"
@@ -104,7 +100,7 @@ jobs:
104
100
  run: |
105
101
  mkdir -p tests/e2e/logs
106
102
  echo "Running E2E tests for ${{ matrix.image.name }}..."
107
- python3 tests/e2e/docker_images_test.py --image ${{ matrix.image.name }}
103
+ python3 tests/e2e/docker_e2e_test.py --image ${{ matrix.image.name }}
108
104
 
109
105
  - name: Generate test report - ${{ matrix.image.name }}
110
106
  if: always()
@@ -50,7 +50,7 @@ jobs:
50
50
  uv run mkdocs build --clean --strict
51
51
 
52
52
  - name: Upload artifact
53
- uses: actions/upload-pages-artifact@v3
53
+ uses: actions/upload-pages-artifact@v4
54
54
  with:
55
55
  path: ./site
56
56
 
@@ -2,19 +2,32 @@ name: Publish Docker Images
2
2
 
3
3
  on:
4
4
  workflow_dispatch:
5
+ inputs:
6
+ version:
7
+ description: 'Version to build (leave empty to use latest git tag)'
8
+ required: false
9
+ type: string
10
+ build_base:
11
+ description: 'Build base image'
12
+ required: true
13
+ type: boolean
14
+ default: true
15
+ build_core:
16
+ description: 'Build core image'
17
+ required: true
18
+ type: boolean
19
+ default: true
5
20
  release:
6
21
  types: [published]
7
22
 
8
23
  jobs:
9
- # Run E2E tests first
10
24
  test-images:
11
25
  uses: ./.github/workflows/docker-e2e-tests.yml
12
26
 
13
- # Build and publish images after tests pass
14
27
  build-and-publish:
15
28
  needs: test-images
16
- runs-on: ubuntu-latest
17
29
  if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
30
+ runs-on: ubuntu-latest
18
31
  permissions:
19
32
  contents: read
20
33
  packages: write
@@ -22,34 +35,49 @@ jobs:
22
35
  version: ${{ steps.get_version.outputs.VERSION }}
23
36
 
24
37
  strategy:
25
- max-parallel: 2
38
+ max-parallel: 1 # Build one at a time to save disk space ~keep
26
39
  matrix:
27
40
  include:
41
+ - name: base
42
+ extras: "cli"
43
+ image_name: "goldziher/kreuzberg"
44
+ should_build: ${{ github.event_name == 'release' || inputs.build_base == true }}
28
45
  - name: core
29
- extras: ""
30
- tag_suffix: ""
31
- - name: easyocr
32
- extras: "easyocr"
33
- tag_suffix: "-easyocr"
34
- - name: paddle
35
- extras: "paddleocr"
36
- tag_suffix: "-paddle"
37
- - name: gmft
38
- extras: "gmft"
39
- tag_suffix: "-gmft"
40
- - name: all
41
- extras: "all"
42
- tag_suffix: "-all"
46
+ extras: "cli chunking crypto document-classification langdetect additional-extensions"
47
+ image_name: "goldziher/kreuzberg-core"
48
+ should_build: ${{ github.event_name == 'release' || inputs.build_core == true }}
43
49
 
44
50
  steps:
45
51
  - name: Free up disk space
46
52
  run: |
53
+ echo "Initial disk space:"
54
+ df -h /
55
+
56
+ # Remove unnecessary large directories (saves ~30GB) ~keep
47
57
  sudo rm -rf /usr/share/dotnet
48
58
  sudo rm -rf /usr/local/lib/android
49
59
  sudo rm -rf /opt/ghc
50
60
  sudo rm -rf /opt/hostedtoolcache/CodeQL
51
- sudo docker system prune -af
52
- df -h
61
+ sudo rm -rf /usr/local/share/boost
62
+ sudo rm -rf /usr/local/lib/node_modules
63
+ sudo rm -rf /opt/microsoft
64
+ sudo rm -rf /usr/local/.ghcup
65
+ sudo rm -rf /opt/hostedtoolcache
66
+
67
+ # Clean apt
68
+ sudo apt-get clean
69
+ sudo rm -rf /var/lib/apt/lists/*
70
+
71
+ # Remove swap to free up space
72
+ sudo swapoff -a
73
+ sudo rm -f /swapfile
74
+
75
+ # Clean Docker completely
76
+ docker system prune -af --volumes || true
77
+ docker builder prune -af || true
78
+
79
+ echo "Available disk space after cleanup:"
80
+ df -h /
53
81
 
54
82
  - name: Checkout repository
55
83
  uses: actions/checkout@v5
@@ -61,6 +89,8 @@ jobs:
61
89
  run: |
62
90
  if [ "${{ github.event_name }}" = "release" ]; then
63
91
  VERSION="${{ github.event.release.tag_name }}"
92
+ elif [ -n "${{ inputs.version }}" ]; then
93
+ VERSION="${{ inputs.version }}"
64
94
  else
65
95
  git fetch --tags
66
96
  VERSION=$(git tag --sort=-version:refname | head -n1)
@@ -81,15 +111,17 @@ jobs:
81
111
  password: ${{ secrets.DOCKERHUB_TOKEN }}
82
112
 
83
113
  - name: Extract metadata (tags, labels) for Docker
114
+ if: ${{ matrix.should_build }}
84
115
  id: meta
85
116
  uses: docker/metadata-action@v5
86
117
  with:
87
- images: goldziher/kreuzberg
118
+ images: ${{ matrix.image_name }}
88
119
  tags: |
89
- type=raw,value=${{ steps.get_version.outputs.VERSION }}${{ matrix.tag_suffix }}
90
- type=raw,value=latest${{ matrix.tag_suffix }}
120
+ type=raw,value=${{ steps.get_version.outputs.VERSION }}
121
+ type=raw,value=latest
91
122
 
92
123
  - name: Build and push Docker image to Docker Hub
124
+ if: ${{ matrix.should_build }}
93
125
  uses: docker/build-push-action@v6
94
126
  with:
95
127
  context: .
@@ -100,15 +132,32 @@ jobs:
100
132
  EXTRAS=${{ matrix.extras }}
101
133
  tags: ${{ steps.meta.outputs.tags }}
102
134
  labels: ${{ steps.meta.outputs.labels }}
103
- cache-from: type=gha
104
- cache-to: type=gha,mode=max
135
+ no-cache: true
136
+
137
+ - name: Clean up after build
138
+ if: always()
139
+ run: |
140
+ # Remove all Docker images and containers
141
+ docker stop $(docker ps -aq) || true
142
+ docker rm $(docker ps -aq) || true
143
+ docker rmi $(docker images -q) || true
144
+
145
+ # Clean all Docker data
146
+ docker system prune -af --volumes || true
147
+ docker builder prune -af || true
148
+
149
+ # Clear buildkit cache
150
+ docker buildx prune -af || true
151
+
152
+ echo "Disk space after cleanup:"
153
+ df -h /
105
154
 
106
155
  - name: Update Docker Hub README
156
+ if: ${{ matrix.should_build }}
107
157
  uses: peter-evans/dockerhub-description@v4
108
- if: matrix.name == 'core'
109
158
  continue-on-error: true
110
159
  with:
111
160
  username: ${{ secrets.DOCKERHUB_USERNAME }}
112
161
  password: ${{ secrets.DOCKERHUB_TOKEN }}
113
- repository: goldziher/kreuzberg
162
+ repository: ${{ matrix.image_name }}
114
163
  readme-filepath: ./.docker/README.md
@@ -0,0 +1,97 @@
1
+ name: Test Docker Builds (No Push)
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ test-build-all-images:
8
+ runs-on: ubuntu-latest
9
+ strategy:
10
+ max-parallel: 1 # Build one at a time to save disk space ~keep
11
+ matrix:
12
+ include:
13
+ - name: base
14
+ extras: "cli"
15
+ image_name: "goldziher/kreuzberg"
16
+ - name: core
17
+ extras: "cli,chunking,crypto,document-classification,langdetect,additional-extensions"
18
+ image_name: "goldziher/kreuzberg-core"
19
+
20
+ steps:
21
+ - name: Free up disk space
22
+ run: |
23
+ echo "Initial disk space:"
24
+ df -h /
25
+
26
+ # Remove unnecessary large directories (saves ~30GB)
27
+ sudo rm -rf /usr/share/dotnet
28
+ sudo rm -rf /usr/local/lib/android
29
+ sudo rm -rf /opt/ghc
30
+ sudo rm -rf /opt/hostedtoolcache/CodeQL
31
+ sudo rm -rf /usr/local/share/boost
32
+ sudo rm -rf /usr/local/lib/node_modules
33
+ sudo rm -rf /opt/microsoft
34
+ sudo rm -rf /usr/local/.ghcup
35
+ sudo rm -rf /opt/hostedtoolcache
36
+
37
+ # Clean apt
38
+ sudo apt-get clean
39
+ sudo rm -rf /var/lib/apt/lists/*
40
+
41
+ # Remove swap to free up space
42
+ sudo swapoff -a
43
+ sudo rm -f /swapfile
44
+
45
+ # Clean Docker completely
46
+ docker system prune -af --volumes || true
47
+ docker builder prune -af || true
48
+
49
+ echo "Available disk space after cleanup:"
50
+ df -h /
51
+
52
+ - name: Checkout repository
53
+ uses: actions/checkout@v5
54
+
55
+ - name: Set up Docker Buildx
56
+ uses: docker/setup-buildx-action@v3
57
+
58
+ - name: Build Docker image - ${{ matrix.name }}
59
+ run: |
60
+ echo "===================="
61
+ echo "Building ${{ matrix.name }} image"
62
+ echo "===================="
63
+
64
+ docker buildx build \
65
+ --file ./.docker/Dockerfile \
66
+ --build-arg EXTRAS="${{ matrix.extras }}" \
67
+ --tag ${{ matrix.image_name }}:test \
68
+ --platform linux/amd64 \
69
+ --no-cache \
70
+ --load \
71
+ .
72
+
73
+ echo "Image built successfully:"
74
+ docker images | grep "${{ matrix.image_name }}:test" || true
75
+
76
+ echo "Current disk usage:"
77
+ df -h /
78
+
79
+ - name: Clean up after build
80
+ if: always()
81
+ run: |
82
+ echo "Cleaning up Docker resources..."
83
+
84
+ # Remove all Docker images and containers
85
+ docker stop $(docker ps -aq) || true
86
+ docker rm $(docker ps -aq) || true
87
+ docker rmi $(docker images -q) || true
88
+
89
+ # Clean all Docker data
90
+ docker system prune -af --volumes || true
91
+ docker builder prune -af || true
92
+
93
+ # Clear buildkit cache
94
+ docker buildx prune -af || true
95
+
96
+ echo "Disk space after cleanup:"
97
+ df -h /
@@ -39,3 +39,30 @@ requirements.txt
39
39
  site/
40
40
  .cache/
41
41
  dist/
42
+ build/
43
+ .task/
44
+ tests/e2e/test_report.json
45
+ tests/e2e/logs/
46
+
47
+ # Additional build artifacts
48
+ *.whl
49
+ *.tar.gz
50
+ .tox/
51
+ .nox/
52
+ wheels/
53
+ share/python-wheels/
54
+
55
+ # Documentation builds
56
+ docs/_build/
57
+ docs/build/
58
+
59
+ # Node.js (if any frontend tools are used)
60
+ node_modules/
61
+ npm-debug.log*
62
+ yarn-debug.log*
63
+ yarn-error.log*
64
+
65
+ # Temporary files
66
+ *.tmp
67
+ *.temp
68
+ .tmp/
@@ -6,7 +6,7 @@ repos:
6
6
  stages: [commit-msg]
7
7
  additional_dependencies: ["@commitlint/config-conventional"]
8
8
  - repo: https://github.com/Goldziher/ai-rulez
9
- rev: v1.6.0
9
+ rev: v1.6.1
10
10
  hooks:
11
11
  - id: ai-rulez-validate
12
12
  - id: ai-rulez-generate
@@ -53,7 +53,7 @@ repos:
53
53
  hooks:
54
54
  - id: pyproject-fmt
55
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.10
56
+ rev: v0.12.11
57
57
  hooks:
58
58
  - id: ruff
59
59
  args: ["--fix", "--unsafe-fixes"]
@@ -66,7 +66,7 @@ repos:
66
66
  additional_dependencies:
67
67
  - tomli
68
68
  - repo: https://github.com/jsh9/pydoclint
69
- rev: 0.6.10
69
+ rev: 0.7.3
70
70
  hooks:
71
71
  - id: pydoclint
72
72
  args: