kreuzberg 3.4.0__tar.gz → 3.4.2__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/ci.yaml +1 -1
  2. kreuzberg-3.4.2/.github/workflows/docs.yml +66 -0
  3. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/publish-docker.yml +19 -9
  4. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.gitignore +1 -0
  5. kreuzberg-3.4.2/PKG-INFO +232 -0
  6. kreuzberg-3.4.2/README.md +168 -0
  7. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/ai-rulez.yaml +56 -3
  8. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/contributing.md +1 -1
  9. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/docker.md +7 -11
  10. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/__init__.py +3 -1
  11. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/pyproject.toml +8 -5
  12. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/uv.lock +2 -11
  13. kreuzberg-3.4.0/.github/benchmarks/README.md +0 -15
  14. kreuzberg-3.4.0/PKG-INFO +0 -290
  15. kreuzberg-3.4.0/README.md +0 -229
  16. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.commitlintrc +0 -0
  17. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.docker/Dockerfile +0 -0
  18. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.docker/README.md +0 -0
  19. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.dockerignore +0 -0
  20. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/dependabot.yaml +0 -0
  21. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/pr-title.yaml +0 -0
  22. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/release.yaml +0 -0
  23. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.markdownlint.yaml +0 -0
  24. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/.pre-commit-config.yaml +0 -0
  25. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/LICENSE +0 -0
  26. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/README.md +0 -0
  27. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/benchmark_baseline.py +0 -0
  28. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/end_to_end_benchmark.py +0 -0
  29. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/final_benchmark.py +0 -0
  30. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/pyproject.toml +0 -0
  31. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/baseline_results.json +0 -0
  32. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  33. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/comprehensive_caching_results.json +0 -0
  34. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/final_benchmark_results.json +0 -0
  35. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/mime_caching_results.json +0 -0
  36. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/msgspec_caching_results.json +0 -0
  37. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/ocr_caching_results.json +0 -0
  38. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/serialization_benchmark_results.json +0 -0
  39. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/statistical_benchmark_results.json +0 -0
  40. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/results/table_caching_results.json +0 -0
  41. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/serialization_benchmark.py +0 -0
  42. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  43. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  44. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  45. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  46. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  47. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  48. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  49. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/benchmarks/statistical_benchmark.py +0 -0
  50. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/advanced/custom-extractors.md +0 -0
  51. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/advanced/custom-hooks.md +0 -0
  52. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/advanced/error-handling.md +0 -0
  53. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/advanced/index.md +0 -0
  54. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/advanced/performance.md +0 -0
  55. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/exceptions.md +0 -0
  56. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/extraction-functions.md +0 -0
  57. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/extractor-registry.md +0 -0
  58. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/index.md +0 -0
  59. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/ocr-configuration.md +0 -0
  60. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/api-reference/types.md +0 -0
  61. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/assets/favicon.png +0 -0
  62. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/assets/logo.png +0 -0
  63. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/changelog.md +0 -0
  64. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/cli.md +0 -0
  65. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/css/extra.css +0 -0
  66. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/examples/extraction-examples.md +0 -0
  67. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/examples/index.md +0 -0
  68. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/getting-started/index.md +0 -0
  69. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/getting-started/installation.md +0 -0
  70. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/getting-started/quick-start.md +0 -0
  71. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/index.md +0 -0
  72. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/api-server.md +0 -0
  73. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/basic-usage.md +0 -0
  74. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/chunking.md +0 -0
  75. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/extraction-configuration.md +0 -0
  76. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/index.md +0 -0
  77. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/metadata-extraction.md +0 -0
  78. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/ocr-backends.md +0 -0
  79. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/ocr-configuration.md +0 -0
  80. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/supported-formats.md +0 -0
  81. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/__main__.py +0 -0
  82. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_api/__init__.py +0 -0
  83. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_api/main.py +0 -0
  84. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_chunker.py +0 -0
  85. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_cli_config.py +0 -0
  86. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_constants.py +0 -0
  87. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/__init__.py +0 -0
  88. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_base.py +0 -0
  89. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_html.py +0 -0
  90. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_image.py +0 -0
  91. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_pandoc.py +0 -0
  92. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_pdf.py +0 -0
  93. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_presentation.py +0 -0
  94. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  95. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_gmft.py +0 -0
  96. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_mime_types.py +0 -0
  97. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_multiprocessing/__init__.py +0 -0
  98. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
  99. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_multiprocessing/process_manager.py +0 -0
  100. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  101. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  102. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_ocr/__init__.py +0 -0
  103. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_ocr/_base.py +0 -0
  104. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_ocr/_easyocr.py +0 -0
  105. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
  106. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_ocr/_tesseract.py +0 -0
  107. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_playa.py +0 -0
  108. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_registry.py +0 -0
  109. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_types.py +0 -0
  110. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/__init__.py +0 -0
  111. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_cache.py +0 -0
  112. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_device.py +0 -0
  113. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_document_cache.py +0 -0
  114. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_errors.py +0 -0
  115. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
  116. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_process_pool.py +0 -0
  117. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_serialization.py +0 -0
  118. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_string.py +0 -0
  119. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_sync.py +0 -0
  120. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/_utils/_tmp.py +0 -0
  121. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/cli.py +0 -0
  122. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/exceptions.py +0 -0
  123. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/extraction.py +0 -0
  124. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/kreuzberg/py.typed +0 -0
  125. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/mkdocs.yaml +0 -0
  126. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/__init__.py +0 -0
  127. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/api/__init__.py +0 -0
  128. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/api/main_test.py +0 -0
  129. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/chunker_test.py +0 -0
  130. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/cli_integration_test.py +0 -0
  131. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/cli_test.py +0 -0
  132. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/conftest.py +0 -0
  133. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/exceptions_test.py +0 -0
  134. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extraction_batch_test.py +0 -0
  135. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extraction_test.py +0 -0
  136. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/__init__.py +0 -0
  137. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/html_test.py +0 -0
  138. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/image_test.py +0 -0
  139. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/pandoc_metadata_test.py +0 -0
  140. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/pandoc_test.py +0 -0
  141. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/pdf_test.py +0 -0
  142. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/presentation_test.py +0 -0
  143. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/extractors/spreed_sheet_test.py +0 -0
  144. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/gmft_extended_test.py +0 -0
  145. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/gmft_test.py +0 -0
  146. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/hooks_test.py +0 -0
  147. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/mime_types_test.py +0 -0
  148. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/multiprocessing/__init__.py +0 -0
  149. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
  150. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/multiprocessing/process_manager_test.py +0 -0
  151. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/multiprocessing/sync_tesseract_test.py +0 -0
  152. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  153. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/__init__.py +0 -0
  154. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/base_test.py +0 -0
  155. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/device_integration_test.py +0 -0
  156. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/easyocr_test.py +0 -0
  157. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/init_test.py +0 -0
  158. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/paddleocr_test.py +0 -0
  159. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/ocr/tesseract_test.py +0 -0
  160. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/playa_test.py +0 -0
  161. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/registry_test.py +0 -0
  162. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/document.docx +0 -0
  163. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  164. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/excel.xlsx +0 -0
  165. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/html.html +0 -0
  166. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/markdown.md +0 -0
  167. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
  168. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/non-searchable.pdf +0 -0
  169. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/ocr-image.jpg +0 -0
  170. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  171. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  172. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  173. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  174. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/sample-contract.pdf +0 -0
  175. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/scanned.pdf +0 -0
  176. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/searchable.pdf +0 -0
  177. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/test_source_files/test-article.pdf +0 -0
  178. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/types_test.py +0 -0
  179. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/__init__.py +0 -0
  180. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/cache_test.py +0 -0
  181. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/device_test.py +0 -0
  182. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/errors_test.py +0 -0
  183. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/pdf_lock_test.py +0 -0
  184. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/process_pool_test.py +0 -0
  185. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/serialization_test.py +0 -0
  186. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/string_test.py +0 -0
  187. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/sync_test.py +0 -0
  188. {kreuzberg-3.4.0 → kreuzberg-3.4.2}/tests/utils/tmp_test.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  strategy:
51
51
  matrix:
52
52
  os: [ ubuntu-latest, macOS-latest, windows-latest ]
53
- python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.9", "3.10", "3.11", "3.12", "3.13"]') }}
53
+ python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
54
54
  runs-on: ${{ matrix.os }}
55
55
  timeout-minutes: 30
56
56
  steps:
@@ -0,0 +1,66 @@
1
+ name: Deploy Documentation
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - 'docs/**'
9
+ - 'mkdocs.yaml'
10
+ - '.github/workflows/docs.yml'
11
+ workflow_dispatch:
12
+
13
+ permissions:
14
+ contents: read
15
+ pages: write
16
+ id-token: write
17
+
18
+ concurrency:
19
+ group: "pages"
20
+ cancel-in-progress: false
21
+
22
+ jobs:
23
+ build:
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - name: Checkout repository
27
+ uses: actions/checkout@v4
28
+ with:
29
+ fetch-depth: 0
30
+
31
+ - name: Setup Python
32
+ uses: actions/setup-python@v5
33
+ with:
34
+ python-version: '3.11'
35
+
36
+ - name: Install uv
37
+ uses: astral-sh/setup-uv@v6
38
+ with:
39
+ enable-cache: true
40
+
41
+ - name: Install dependencies
42
+ run: |
43
+ uv sync --group doc
44
+
45
+ - name: Setup Pages
46
+ uses: actions/configure-pages@v5
47
+
48
+ - name: Build documentation
49
+ run: |
50
+ uv run mkdocs build --clean --strict
51
+
52
+ - name: Upload artifact
53
+ uses: actions/upload-pages-artifact@v3
54
+ with:
55
+ path: ./site
56
+
57
+ deploy:
58
+ environment:
59
+ name: github-pages
60
+ url: ${{ steps.deployment.outputs.page_url }}
61
+ runs-on: ubuntu-latest
62
+ needs: build
63
+ steps:
64
+ - name: Deploy to GitHub Pages
65
+ id: deployment
66
+ uses: actions/deploy-pages@v4
@@ -9,11 +9,12 @@ on:
9
9
  - completed
10
10
  branches:
11
11
  - main
12
+ workflow_dispatch:
12
13
 
13
14
  jobs:
14
15
  build-and-push:
15
16
  runs-on: ubuntu-latest
16
- if: ${{ github.event.workflow_run.conclusion == 'success' }}
17
+ if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
17
18
  permissions:
18
19
  contents: read
19
20
  packages: write
@@ -41,20 +42,28 @@ jobs:
41
42
  - name: Checkout repository
42
43
  uses: actions/checkout@v4
43
44
  with:
44
- ref: ${{ github.event.workflow_run.head_branch }}
45
+ ref: ${{ github.event.workflow_run.head_branch || github.ref }}
45
46
 
46
47
  - name: Get release version
47
48
  id: get_version
48
49
  run: |
49
- echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
50
- # If triggered by a tag, extract version
51
- if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
52
- echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
53
- else
54
- # Get the latest tag
50
+ if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
51
+ # For manual dispatch, get the latest tag by listing all tags
55
52
  git fetch --tags
56
- echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
53
+ VERSION=$(git tag --sort=-version:refname | head -n1)
54
+ else
55
+ # For workflow_run, use the head branch
56
+ VERSION="${{ github.event.workflow_run.head_branch }}"
57
+ # If triggered by a tag, extract version
58
+ if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
59
+ VERSION="$VERSION"
60
+ else
61
+ # Get the latest tag by listing all tags
62
+ git fetch --tags
63
+ VERSION=$(git tag --sort=-version:refname | head -n1)
64
+ fi
57
65
  fi
66
+ echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
58
67
 
59
68
  - name: Set up QEMU
60
69
  uses: docker/setup-qemu-action@v3
@@ -94,6 +103,7 @@ jobs:
94
103
  - name: Update Docker Hub README
95
104
  uses: peter-evans/dockerhub-description@v4
96
105
  if: matrix.name == 'core'
106
+ continue-on-error: true
97
107
  with:
98
108
  username: ${{ secrets.DOCKERHUB_USERNAME }}
99
109
  password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -32,3 +32,4 @@ docker-compose.yaml
32
32
  GEMINI.md
33
33
  prompt_template.egg-info/
34
34
  requirements.txt
35
+ site/
@@ -0,0 +1,232 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.4.2
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: anyio>=4.9.0
26
+ Requires-Dist: charset-normalizer>=3.4.2
27
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
+ Requires-Dist: html-to-markdown>=1.4.0
29
+ Requires-Dist: msgspec>=0.18.0
30
+ Requires-Dist: playa-pdf>=0.6.1
31
+ Requires-Dist: psutil>=7.0.0
32
+ Requires-Dist: pypdfium2==4.30.0
33
+ Requires-Dist: python-calamine>=0.3.2
34
+ Requires-Dist: python-pptx>=1.0.2
35
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
36
+ Provides-Extra: all
37
+ Requires-Dist: click>=8.2.1; extra == 'all'
38
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
39
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
40
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
41
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
42
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
43
+ Requires-Dist: rich>=14.0.0; extra == 'all'
44
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
45
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
46
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
47
+ Provides-Extra: api
48
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
49
+ Provides-Extra: chunking
50
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
51
+ Provides-Extra: cli
52
+ Requires-Dist: click>=8.2.1; extra == 'cli'
53
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
54
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
55
+ Provides-Extra: easyocr
56
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
57
+ Provides-Extra: gmft
58
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
59
+ Provides-Extra: paddleocr
60
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
61
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
62
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
63
+ Description-Content-Type: text/markdown
64
+
65
+ # Kreuzberg
66
+
67
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
68
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
69
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
70
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
71
+
72
+ **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
73
+
74
+ 📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
75
+
76
+ ## Why Kreuzberg?
77
+
78
+ - **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
79
+ - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
80
+ - **⚡ Dual APIs**: Only library with both sync and async support
81
+ - **🔧 Zero Configuration**: Works out of the box with sane defaults
82
+ - **🏠 Local Processing**: No cloud dependencies or external API calls
83
+ - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
84
+ - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
85
+ - **🐳 Production Ready**: CLI, REST API, and Docker images included
86
+
87
+ ## Quick Start
88
+
89
+ ### Installation
90
+
91
+ ```bash
92
+ # Basic installation
93
+ pip install kreuzberg
94
+
95
+ # With optional features
96
+ pip install "kreuzberg[cli,api]" # CLI + REST API
97
+ pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
98
+ pip install "kreuzberg[all]" # Everything
99
+ ```
100
+
101
+ ### System Dependencies
102
+
103
+ ```bash
104
+ # Ubuntu/Debian
105
+ sudo apt-get install tesseract-ocr pandoc
106
+
107
+ # macOS
108
+ brew install tesseract pandoc
109
+
110
+ # Windows
111
+ choco install tesseract pandoc
112
+ ```
113
+
114
+ ### Basic Usage
115
+
116
+ ```python
117
+ import asyncio
118
+ from kreuzberg import extract_file
119
+
120
+ async def main():
121
+ # Extract from any document type
122
+ result = await extract_file("document.pdf")
123
+ print(result.content)
124
+ print(result.metadata)
125
+
126
+ asyncio.run(main())
127
+ ```
128
+
129
+ ## Deployment Options
130
+
131
+ ### 🐳 Docker (Recommended)
132
+
133
+ ```bash
134
+ # Run API server
135
+ docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
136
+
137
+ # Extract files
138
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
139
+ ```
140
+
141
+ Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
142
+
143
+ ### 🌐 REST API
144
+
145
+ ```bash
146
+ # Install and run
147
+ pip install "kreuzberg[api]"
148
+ litestar --app kreuzberg._api.main:app run
149
+
150
+ # Health check
151
+ curl http://localhost:8000/health
152
+
153
+ # Extract files
154
+ curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
155
+ ```
156
+
157
+ ### 💻 Command Line
158
+
159
+ ```bash
160
+ # Install CLI
161
+ pip install "kreuzberg[cli]"
162
+
163
+ # Extract to stdout
164
+ kreuzberg extract document.pdf
165
+
166
+ # JSON output with metadata
167
+ kreuzberg extract document.pdf --output-format json --show-metadata
168
+
169
+ # Batch processing
170
+ kreuzberg extract *.pdf --output-dir ./extracted/
171
+ ```
172
+
173
+ ## Supported Formats
174
+
175
+ | Category | Formats |
176
+ | ----------------- | ------------------------------ |
177
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
178
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
179
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
180
+ | **Presentations** | PPTX, PPT, ODP |
181
+ | **Web** | HTML, XML, MHTML |
182
+ | **Archives** | Support via extraction |
183
+
184
+ ## Performance
185
+
186
+ **Fastest extraction speeds** with minimal resource usage:
187
+
188
+ | Library | Speed | Memory | Size | Success Rate |
189
+ | ------------- | -------------- | ------------- | ----------- | ------------ |
190
+ | **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
191
+ | Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
192
+ | MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
193
+ | Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
194
+
195
+ > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
196
+
197
+ ## Documentation
198
+
199
+ ### Quick Links
200
+
201
+ - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
202
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
203
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
204
+ - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
205
+ - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
206
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
207
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
208
+
209
+ ## Advanced Features
210
+
211
+ - **📊 Table Extraction**: Extract tables from PDFs with GMFT
212
+ - **🧩 Content Chunking**: Split documents for RAG applications
213
+ - **🎯 Custom Extractors**: Extend with your own document handlers
214
+ - **🔧 Configuration**: Flexible TOML-based configuration
215
+ - **🪝 Hooks**: Pre/post-processing customization
216
+ - **🌍 Multi-language OCR**: 100+ languages supported
217
+ - **⚙️ Metadata Extraction**: Rich document metadata
218
+ - **🔄 Batch Processing**: Efficient bulk document processing
219
+
220
+ ## License
221
+
222
+ MIT License - see [LICENSE](LICENSE) for details.
223
+
224
+ ______________________________________________________________________
225
+
226
+ <div align="center">
227
+
228
+ **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
229
+
230
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
231
+
232
+ </div>
@@ -0,0 +1,168 @@
1
+ # Kreuzberg
2
+
3
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
4
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
5
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
9
+
10
+ 📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
11
+
12
+ ## Why Kreuzberg?
13
+
14
+ - **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
15
+ - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
16
+ - **⚡ Dual APIs**: Only library with both sync and async support
17
+ - **🔧 Zero Configuration**: Works out of the box with sane defaults
18
+ - **🏠 Local Processing**: No cloud dependencies or external API calls
19
+ - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
20
+ - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
21
+ - **🐳 Production Ready**: CLI, REST API, and Docker images included
22
+
23
+ ## Quick Start
24
+
25
+ ### Installation
26
+
27
+ ```bash
28
+ # Basic installation
29
+ pip install kreuzberg
30
+
31
+ # With optional features
32
+ pip install "kreuzberg[cli,api]" # CLI + REST API
33
+ pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
34
+ pip install "kreuzberg[all]" # Everything
35
+ ```
36
+
37
+ ### System Dependencies
38
+
39
+ ```bash
40
+ # Ubuntu/Debian
41
+ sudo apt-get install tesseract-ocr pandoc
42
+
43
+ # macOS
44
+ brew install tesseract pandoc
45
+
46
+ # Windows
47
+ choco install tesseract pandoc
48
+ ```
49
+
50
+ ### Basic Usage
51
+
52
+ ```python
53
+ import asyncio
54
+ from kreuzberg import extract_file
55
+
56
+ async def main():
57
+ # Extract from any document type
58
+ result = await extract_file("document.pdf")
59
+ print(result.content)
60
+ print(result.metadata)
61
+
62
+ asyncio.run(main())
63
+ ```
64
+
65
+ ## Deployment Options
66
+
67
+ ### 🐳 Docker (Recommended)
68
+
69
+ ```bash
70
+ # Run API server
71
+ docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
72
+
73
+ # Extract files
74
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
75
+ ```
76
+
77
+ Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
78
+
79
+ ### 🌐 REST API
80
+
81
+ ```bash
82
+ # Install and run
83
+ pip install "kreuzberg[api]"
84
+ litestar --app kreuzberg._api.main:app run
85
+
86
+ # Health check
87
+ curl http://localhost:8000/health
88
+
89
+ # Extract files
90
+ curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
91
+ ```
92
+
93
+ ### 💻 Command Line
94
+
95
+ ```bash
96
+ # Install CLI
97
+ pip install "kreuzberg[cli]"
98
+
99
+ # Extract to stdout
100
+ kreuzberg extract document.pdf
101
+
102
+ # JSON output with metadata
103
+ kreuzberg extract document.pdf --output-format json --show-metadata
104
+
105
+ # Batch processing
106
+ kreuzberg extract *.pdf --output-dir ./extracted/
107
+ ```
108
+
109
+ ## Supported Formats
110
+
111
+ | Category | Formats |
112
+ | ----------------- | ------------------------------ |
113
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
114
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
115
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
116
+ | **Presentations** | PPTX, PPT, ODP |
117
+ | **Web** | HTML, XML, MHTML |
118
+ | **Archives** | Support via extraction |
119
+
120
+ ## Performance
121
+
122
+ **Fastest extraction speeds** with minimal resource usage:
123
+
124
+ | Library | Speed | Memory | Size | Success Rate |
125
+ | ------------- | -------------- | ------------- | ----------- | ------------ |
126
+ | **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
127
+ | Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
128
+ | MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
129
+ | Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
130
+
131
+ > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
132
+
133
+ ## Documentation
134
+
135
+ ### Quick Links
136
+
137
+ - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
138
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
139
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
140
+ - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
141
+ - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
142
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
143
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
144
+
145
+ ## Advanced Features
146
+
147
+ - **📊 Table Extraction**: Extract tables from PDFs with GMFT
148
+ - **🧩 Content Chunking**: Split documents for RAG applications
149
+ - **🎯 Custom Extractors**: Extend with your own document handlers
150
+ - **🔧 Configuration**: Flexible TOML-based configuration
151
+ - **🪝 Hooks**: Pre/post-processing customization
152
+ - **🌍 Multi-language OCR**: 100+ languages supported
153
+ - **⚙️ Metadata Extraction**: Rich document metadata
154
+ - **🔄 Batch Processing**: Efficient bulk document processing
155
+
156
+ ## License
157
+
158
+ MIT License - see [LICENSE](LICENSE) for details.
159
+
160
+ ______________________________________________________________________
161
+
162
+ <div align="center">
163
+
164
+ **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
165
+
166
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
167
+
168
+ </div>
@@ -1,6 +1,6 @@
1
1
  metadata:
2
2
  name: "Kreuzberg"
3
- version: "3.2.0"
3
+ version: "3.4.0"
4
4
  description: "A text extraction library supporting PDFs, images, office documents and more"
5
5
 
6
6
  outputs:
@@ -96,8 +96,9 @@ rules:
96
96
  - Run manually: `pre-commit run --all-files`
97
97
 
98
98
  ### Documentation
99
- - Build docs: `mkdocs build`
100
- - Serve docs locally: `mkdocs serve`
99
+ - Build docs: `uv run mkdocs build --clean --strict`
100
+ - Serve docs locally: `uv run mkdocs serve`
101
+ - Install doc dependencies: `uv sync --group doc`
101
102
 
102
103
  - name: "Architecture"
103
104
  priority: 9
@@ -115,6 +116,8 @@ rules:
115
116
  - **GMFT Integration**: Table extraction using GMFT library for PDFs
116
117
  - **Chunking**: Text splitting functionality in `_chunker.py`
117
118
  - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
119
+ - **API Server**: REST API using Litestar framework in `_api/main.py`
120
+ - **CLI**: Command-line interface for batch processing and automation
118
121
 
119
122
  ### Adding New Features
120
123
  - New extractors: Inherit from `BaseExtractor` and register with `ExtractorRegistry`
@@ -153,6 +156,56 @@ rules:
153
156
  - All builtin imports should be at the top level (except for cyclical or optional dependencies)
154
157
  - When committing, always use the format specified in the CLAUDE.md
155
158
 
159
+ - name: "CI/CD and Deployment"
160
+ priority: 6
161
+ content: |
162
+ ### GitHub Actions Workflows
163
+ - **Release**: Automated PyPI publishing via GitHub releases
164
+ - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
165
+ - **Documentation**: Auto-deploy to GitHub Pages on docs changes
166
+
167
+ ### Docker Variants
168
+ - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
169
+ - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
170
+ - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
171
+ - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
172
+ - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
173
+
174
+ ### Manual Triggers
175
+ - Docker builds: `gh workflow run "Publish Docker Images"`
176
+ - Documentation: Deploys automatically on `docs/` changes (no manual trigger needed)
177
+
178
+ ### Common Issues
179
+ - **Docker version detection**: Use `git tag --sort=-version:refname | head -n1`, not `git describe`
180
+ - **Docs dependencies**: Use `uv sync --group doc` for proper mkdocs-material[imaging] support
181
+ - **Docker Hub README**: May fail due to permissions; use `continue-on-error: true`
182
+
183
+ - name: "Package Management"
184
+ priority: 6
185
+ content: |
186
+ ### Optional Dependencies Structure
187
+ ```toml
188
+ [project.optional-dependencies]
189
+ api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
190
+ cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
191
+ chunking = ["semantic-text-splitter>=0.27.0"]
192
+ easyocr = ["easyocr>=1.7.2"]
193
+ gmft = ["gmft>=0.4.2"]
194
+ paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
195
+ all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
196
+ ```
197
+
198
+ ### Installation Patterns
199
+ - Basic: `pip install kreuzberg`
200
+ - With features: `pip install "kreuzberg[api,cli]"`
201
+ - All features: `pip install "kreuzberg[all]"`
202
+ - Development: `uv sync --all-extras`
203
+
204
+ ### Dependencies
205
+ - **Core**: pypdfium2, playa-pdf, python-pptx, etc.
206
+ - **System**: tesseract-ocr, pandoc (via package manager)
207
+ - **Development**: Uses dependency groups in pyproject.toml
208
+
156
209
  sections:
157
210
  - title: "Planned Features"
158
211
  content: |
@@ -68,7 +68,7 @@ Use [Conventional Commits](https://www.conventionalcommits.org/):
68
68
 
69
69
  ## Notes
70
70
 
71
- - Python 3.9-3.13 supported
71
+ - Python 3.10-3.13 supported
72
72
  - System dependencies (optional): Tesseract, Pandoc
73
73
  - Pre-commit runs automatically on commit
74
74
  - Join our [Discord](https://discord.gg/pXxagNK2zN) for help
@@ -7,11 +7,12 @@ Kreuzberg provides official Docker images for easy deployment and containerized
7
7
  Docker images are available on [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg):
8
8
 
9
9
  - `goldziher/kreuzberg:latest` - Core image with API server and Tesseract OCR
10
- - `goldziher/kreuzberg:VERSION` - Specific version (e.g., `3.0.0`)
11
- - `goldziher/kreuzberg:VERSION-easyocr` - With EasyOCR support
12
- - `goldziher/kreuzberg:VERSION-paddle` - With PaddleOCR support
13
- - `goldziher/kreuzberg:VERSION-gmft` - With GMFT table extraction
14
- - `goldziher/kreuzberg:VERSION-all` - With all optional dependencies
10
+ - `goldziher/kreuzberg:latest-easyocr` - With EasyOCR support
11
+ - `goldziher/kreuzberg:latest-paddle` - With PaddleOCR support
12
+ - `goldziher/kreuzberg:latest-gmft` - With GMFT table extraction
13
+ - `goldziher/kreuzberg:latest-all` - With all optional dependencies
14
+
15
+ > **Note**: Specific version tags are also available (e.g., `v3.4.0`, `v3.4.0-easyocr`)
15
16
 
16
17
  ## Quick Start
17
18
 
@@ -45,8 +46,6 @@ curl -X POST http://localhost:8000/extract \
45
46
  Create a `docker-compose.yml`:
46
47
 
47
48
  ```yaml
48
- version: '3.8'
49
-
50
49
  services:
51
50
  kreuzberg:
52
51
  image: goldziher/kreuzberg:latest
@@ -54,9 +53,6 @@ services:
54
53
  - "8000:8000"
55
54
  environment:
56
55
  - PYTHONUNBUFFERED=1
57
- volumes:
58
- # Optional: Mount local directory for file access
59
- - ./documents:/app/documents
60
56
  restart: unless-stopped
61
57
  ```
62
58
 
@@ -107,7 +103,7 @@ CMD ["python", "custom_config.py"]
107
103
 
108
104
  ### Base Image
109
105
 
110
- - Based on `python:3.13-bookworm`
106
+ - Based on `python:3.13-bookworm` (Kreuzberg itself requires Python 3.10+)
111
107
  - Includes system dependencies: `pandoc`, `tesseract-ocr`
112
108
  - Runs as non-root user `appuser`
113
109
  - Exposes port 8000