kreuzberg 3.4.2__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/publish-docker.yml +2 -3
  2. kreuzberg-3.5.0/.gitmodules +3 -0
  3. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/PKG-INFO +3 -1
  4. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/benchmark_baseline.py +1 -1
  5. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/final_benchmark.py +1 -1
  6. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/serialization_benchmark.py +0 -1
  7. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/performance.md +46 -9
  8. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/changelog.md +2 -0
  9. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/examples/extraction-examples.md +41 -0
  10. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/installation.md +9 -1
  11. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/docker.md +49 -0
  12. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/extraction-configuration.md +53 -0
  13. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/ocr-configuration.md +86 -19
  14. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/__init__.py +2 -0
  15. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_image.py +21 -1
  16. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pdf.py +44 -14
  17. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_spread_sheet.py +2 -2
  18. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_gmft.py +4 -4
  19. kreuzberg-3.5.0/kreuzberg/_language_detection.py +95 -0
  20. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
  21. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/process_manager.py +2 -1
  22. kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
  23. kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
  24. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_easyocr.py +1 -1
  25. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_tesseract.py +7 -3
  26. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_types.py +11 -4
  27. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_device.py +2 -2
  28. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_process_pool.py +2 -2
  29. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_sync.py +1 -5
  30. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_tmp.py +2 -2
  31. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/extraction.py +10 -0
  32. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/mkdocs.yaml +1 -0
  33. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/pyproject.toml +22 -1
  34. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/api/main_test.py +2 -5
  35. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/cli_integration_test.py +9 -1
  36. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extraction_test.py +10 -2
  37. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/image_test.py +17 -4
  38. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pdf_test.py +8 -0
  39. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/gmft_extended_test.py +6 -17
  40. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/gmft_test.py +0 -3
  41. kreuzberg-3.5.0/tests/language_detection_test.py +237 -0
  42. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/sync_tesseract_test.py +2 -3
  43. kreuzberg-3.5.0/tests/test_source_files/french-text.txt +2 -0
  44. kreuzberg-3.5.0/tests/test_source_files/german-text.txt +2 -0
  45. kreuzberg-3.5.0/tests/test_source_files/spanish-text.txt +2 -0
  46. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/cache_test.py +0 -3
  47. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/errors_test.py +0 -1
  48. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/process_pool_test.py +0 -3
  49. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/sync_test.py +0 -7
  50. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/uv.lock +68 -2
  51. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.commitlintrc +0 -0
  52. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.docker/Dockerfile +0 -0
  53. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.docker/README.md +0 -0
  54. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.dockerignore +0 -0
  55. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/dependabot.yaml +0 -0
  56. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/ci.yaml +0 -0
  57. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/docs.yml +0 -0
  58. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/pr-title.yaml +0 -0
  59. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/release.yaml +0 -0
  60. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.gitignore +0 -0
  61. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.markdownlint.yaml +0 -0
  62. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/.pre-commit-config.yaml +0 -0
  63. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/LICENSE +0 -0
  64. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/README.md +0 -0
  65. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/ai-rulez.yaml +0 -0
  66. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/README.md +0 -0
  67. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/end_to_end_benchmark.py +0 -0
  68. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/pyproject.toml +0 -0
  69. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/baseline_results.json +0 -0
  70. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  71. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  72. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/final_benchmark_results.json +0 -0
  73. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/mime_caching_results.json +0 -0
  74. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  75. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/ocr_caching_results.json +0 -0
  76. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  77. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  78. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/results/table_caching_results.json +0 -0
  79. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  80. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  81. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  82. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  83. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  84. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  85. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  86. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/statistical_benchmark.py +0 -0
  87. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/custom-extractors.md +0 -0
  88. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/custom-hooks.md +0 -0
  89. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/error-handling.md +0 -0
  90. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/index.md +0 -0
  91. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/exceptions.md +0 -0
  92. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/extraction-functions.md +0 -0
  93. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/extractor-registry.md +0 -0
  94. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/index.md +0 -0
  95. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/ocr-configuration.md +0 -0
  96. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/api-reference/types.md +0 -0
  97. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/assets/favicon.png +0 -0
  98. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/assets/logo.png +0 -0
  99. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/cli.md +0 -0
  100. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/contributing.md +0 -0
  101. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/css/extra.css +0 -0
  102. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/examples/index.md +0 -0
  103. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/index.md +0 -0
  104. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/quick-start.md +0 -0
  105. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/index.md +0 -0
  106. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/api-server.md +0 -0
  107. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/basic-usage.md +0 -0
  108. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/chunking.md +0 -0
  109. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/index.md +0 -0
  110. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/metadata-extraction.md +0 -0
  111. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/ocr-backends.md +0 -0
  112. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/supported-formats.md +0 -0
  113. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/__main__.py +0 -0
  114. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_api/__init__.py +0 -0
  115. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_api/main.py +0 -0
  116. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_chunker.py +0 -0
  117. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_cli_config.py +0 -0
  118. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_constants.py +0 -0
  119. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/__init__.py +0 -0
  120. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_base.py +0 -0
  121. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_html.py +0 -0
  122. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  123. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_presentation.py +0 -0
  124. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_mime_types.py +0 -0
  125. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
  126. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  127. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  128. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/__init__.py +0 -0
  129. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_base.py +0 -0
  130. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  131. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_playa.py +0 -0
  132. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_registry.py +0 -0
  133. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/__init__.py +0 -0
  134. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_cache.py +0 -0
  135. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_document_cache.py +0 -0
  136. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_errors.py +0 -0
  137. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  138. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_serialization.py +0 -0
  139. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_utils/_string.py +0 -0
  140. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/cli.py +0 -0
  141. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/exceptions.py +0 -0
  142. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/py.typed +0 -0
  143. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/__init__.py +0 -0
  144. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/api/__init__.py +0 -0
  145. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/chunker_test.py +0 -0
  146. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/cli_test.py +0 -0
  147. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/conftest.py +0 -0
  148. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/exceptions_test.py +0 -0
  149. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extraction_batch_test.py +0 -0
  150. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/__init__.py +0 -0
  151. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/html_test.py +0 -0
  152. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  153. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/pandoc_test.py +0 -0
  154. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/presentation_test.py +0 -0
  155. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/extractors/spreed_sheet_test.py +0 -0
  156. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/hooks_test.py +0 -0
  157. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/mime_types_test.py +0 -0
  158. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/__init__.py +0 -0
  159. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  160. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/process_manager_test.py +0 -0
  161. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  162. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/__init__.py +0 -0
  163. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/base_test.py +0 -0
  164. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/device_integration_test.py +0 -0
  165. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/easyocr_test.py +0 -0
  166. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/init_test.py +0 -0
  167. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/paddleocr_test.py +0 -0
  168. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/ocr/tesseract_test.py +0 -0
  169. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/playa_test.py +0 -0
  170. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/registry_test.py +0 -0
  171. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/document.docx +0 -0
  172. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  173. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/excel.xlsx +0 -0
  174. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/html.html +0 -0
  175. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/markdown.md +0 -0
  176. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  177. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/non-searchable.pdf +0 -0
  178. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/ocr-image.jpg +0 -0
  179. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  180. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  181. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  182. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  183. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/sample-contract.pdf +0 -0
  184. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/scanned.pdf +0 -0
  185. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/searchable.pdf +0 -0
  186. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/test_source_files/test-article.pdf +0 -0
  187. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/types_test.py +0 -0
  188. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/__init__.py +0 -0
  189. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/device_test.py +0 -0
  190. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/pdf_lock_test.py +0 -0
  191. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/serialization_test.py +0 -0
  192. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/string_test.py +0 -0
  193. {kreuzberg-3.4.2 → kreuzberg-3.5.0}/tests/utils/tmp_test.py +0 -0
@@ -1,4 +1,3 @@
1
- # .github/workflows/publish-docker.yml
2
1
 
3
2
  name: Publish Docker Images
4
3
 
@@ -24,7 +23,7 @@ jobs:
24
23
  include:
25
24
  - name: core
26
25
  extras: ""
27
- tag_suffix: "" # The base image tag (includes API + tesseract)
26
+ tag_suffix: ""
28
27
  - name: easyocr
29
28
  extras: "easyocr"
30
29
  tag_suffix: "-easyocr"
@@ -89,7 +88,7 @@ jobs:
89
88
  type=raw,value=latest${{ matrix.tag_suffix }}
90
89
 
91
90
  - name: Build and push Docker image
92
- uses: docker/build-push-action@v5
91
+ uses: docker/build-push-action@v6
93
92
  with:
94
93
  context: .
95
94
  file: ./.docker/Dockerfile
@@ -0,0 +1,3 @@
1
+ [submodule "python-text-extraction-libs-benchmarks"]
2
+ path = python-text-extraction-libs-benchmarks
3
+ url = https://github.com/Goldziher/python-text-extraction-libs-benchmarks.git
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.4.2
3
+ Version: 3.5.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -56,6 +56,8 @@ Provides-Extra: easyocr
56
56
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
57
57
  Provides-Extra: gmft
58
58
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
59
+ Provides-Extra: langdetect
60
+ Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
59
61
  Provides-Extra: paddleocr
60
62
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
61
63
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
8
8
  from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
9
9
 
10
10
 
11
- async def run_baseline_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
11
+ async def run_baseline_benchmark() -> dict[str, object] | None:
12
12
  """Run comprehensive baseline benchmark."""
13
13
  test_files_dir = Path("tests/test_source_files")
14
14
  test_files = list(test_files_dir.glob("*.pdf"))
@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
15
15
  )
16
16
 
17
17
 
18
- async def run_final_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
18
+ async def run_final_benchmark() -> dict[str, object] | None:
19
19
  """Run comprehensive benchmark of all caching improvements."""
20
20
  test_files_dir = Path("tests/test_source_files")
21
21
  pdf_files = list(test_files_dir.glob("*.pdf"))
@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
101
101
  json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
102
102
  json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
103
103
 
104
- # Type casting for arithmetic operations
105
104
  json_ser_mean = json_serialize["mean"]
106
105
  json_deser_mean = json_deserialize["mean"]
107
106
  msgpack_ser_mean = msgpack_serialize["mean"]
@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
125
125
  1. **Configure OCR appropriately** for your document types
126
126
  1. **Profile your specific workload** - results vary by content
127
127
 
128
- ### Configuration Examples
128
+ ### Optimized Default Configuration
129
+
130
+ Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
129
131
 
130
132
  ```python
131
- from kreuzberg import ExtractionConfig, extract_file_sync
132
- from kreuzberg._ocr import TesseractConfig
133
+ from kreuzberg import ExtractionConfig
133
134
 
134
- # Optimized for speed
135
- fast_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=6)) # Assume uniform text block
135
+ # Default configuration - already optimized for modern documents
136
+ config = ExtractionConfig() # Uses optimized defaults:
137
+ # - PSM: AUTO_ONLY (fast without orientation detection)
138
+ # - Language model: Disabled for performance
139
+ # - Dictionary correction: Enabled for accuracy
140
+ ```
136
141
 
137
- # Optimized for accuracy
138
- accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1)) # Auto page segmentation
142
+ ### Advanced Configuration Examples
139
143
 
140
- # For simple documents (no OCR)
141
- text_only_config = ExtractionConfig(force_ocr=False, ocr_backend=None)
144
+ ```python
145
+ from kreuzberg import ExtractionConfig, extract_file_sync
146
+ from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
147
+
148
+ # Maximum speed configuration (for high-volume processing)
149
+ speed_config = ExtractionConfig(
150
+ ocr_backend="tesseract",
151
+ ocr_config=TesseractConfig(
152
+ psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
153
+ language_model_ngram_on=False, # Already disabled by default
154
+ tessedit_enable_dict_correction=False, # Disable for maximum speed
155
+ ),
156
+ )
157
+
158
+ # Maximum accuracy configuration (for degraded documents)
159
+ accuracy_config = ExtractionConfig(
160
+ ocr_backend="tesseract",
161
+ ocr_config=TesseractConfig(
162
+ psm=PSMMode.AUTO, # Full analysis with orientation detection
163
+ language_model_ngram_on=True, # Enable for historical/degraded text
164
+ tessedit_enable_dict_correction=True, # Default - keep enabled
165
+ ),
166
+ )
167
+
168
+ # No OCR configuration (text documents only)
169
+ text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
142
170
  ```
143
171
 
172
+ ### Performance Optimization Tips
173
+
174
+ Based on comprehensive benchmarking with 138+ documents:
175
+
176
+ 1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
177
+ 1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
178
+ 1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide 30x+ speedup with minimal quality impact on clean documents
179
+ 1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
180
+
144
181
  ### Batch Processing Best Practices
145
182
 
146
183
  ```python
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
12
12
  - Documentation site with comprehensive examples and API reference
13
13
  - Improved configuration for all OCR backends
14
14
  - Added hooks system for validation and post-processing
15
+ - Language detection feature with `auto_detect_language` configuration option
16
+ - New optional dependency group `langdetect` for automatic language detection
15
17
 
16
18
  ### Changed
17
19
 
@@ -79,6 +79,47 @@ async def extract_with_different_backends():
79
79
  print(f"No OCR result: {result.content[:100]}...")
80
80
  ```
81
81
 
82
+ ## Language Detection
83
+
84
+ ```python
85
+ from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
86
+
87
+ async def detect_document_language():
88
+ # Simple automatic language detection
89
+ result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
90
+
91
+ # Access detected languages
92
+ if result.detected_languages:
93
+ print(f"Detected languages: {', '.join(result.detected_languages)}")
94
+ # Example output: "Detected languages: en, de, fr"
95
+
96
+ async def detect_multilingual_document():
97
+ # Advanced multilingual detection with custom configuration
98
+ lang_config = LanguageDetectionConfig(
99
+ multilingual=True, # Detect multiple languages in mixed text
100
+ top_k=5, # Return top 5 languages
101
+ low_memory=False, # Use high accuracy mode
102
+ )
103
+
104
+ result = await extract_file(
105
+ "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
106
+ )
107
+
108
+ if result.detected_languages:
109
+ print(f"Detected languages: {result.detected_languages}")
110
+
111
+ # Use detected languages for OCR
112
+ from kreuzberg import TesseractConfig
113
+
114
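+ # Build a "+"-joined language string for Tesseract. NOTE: detected codes are ISO 639-1, so this yields e.g. "en+de+fr"; Tesseract normally expects codes such as "eng+deu+fra", so map them first if needed.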
115
+ tesseract_langs = "+".join(result.detected_languages[:3])
116
+
117
+ result_with_ocr = await extract_file(
118
+ "multilingual_document.pdf",
119
+ config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
120
+ )
121
+ ```
122
+
82
123
  ## Table Extraction
83
124
 
84
125
  ```python
@@ -102,6 +102,14 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
102
102
  pip install "kreuzberg[gmft]"
103
103
  ```
104
104
 
105
+ ### Language Detection
106
+
107
+ Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
108
+
109
+ ```shell
110
+ pip install "kreuzberg[langdetect]"
111
+ ```
112
+
105
113
  ### All Optional Dependencies
106
114
 
107
115
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -113,5 +121,5 @@ pip install "kreuzberg[all]"
113
121
  This is equivalent to:
114
122
 
115
123
  ```shell
116
- pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
124
+ pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
117
125
  ```
@@ -124,6 +124,34 @@ Additional dependencies by variant:
124
124
  - **gmft**: GMFT for table extraction
125
125
  - **all**: All optional dependencies
126
126
 
127
+ ### Health Check
128
+
129
+ All Docker images include a health check endpoint:
130
+
131
+ ```bash
132
+ # Check API health
133
+ curl http://localhost:8000/health
134
+ ```
135
+
136
+ Returns a JSON response with service status and version information.
137
+
138
+ ### Observability
139
+
140
+ The Docker images include built-in OpenTelemetry instrumentation via Litestar:
141
+
142
+ - **Tracing**: Automatic request/response tracing
143
+ - **Metrics**: Performance and usage metrics
144
+ - **Logging**: Structured JSON logging
145
+
146
+ Configure via standard OpenTelemetry environment variables:
147
+
148
+ ```bash
149
+ docker run -p 8000:8000 \
150
+ -e OTEL_SERVICE_NAME=kreuzberg-api \
151
+ -e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
152
+ goldziher/kreuzberg:latest
153
+ ```
154
+
127
155
  ### Environment Variables
128
156
 
129
157
  - `PYTHONUNBUFFERED=1` - Ensures proper logging output
@@ -150,6 +178,12 @@ server {
150
178
  client_max_body_size 100M;
151
179
  proxy_read_timeout 300s;
152
180
  }
181
+
182
+ # Health check endpoint
183
+ location /health {
184
+ proxy_pass http://localhost:8000/health;
185
+ access_log off;
186
+ }
153
187
  }
154
188
  ```
155
189
 
@@ -175,6 +209,21 @@ spec:
175
209
  image: goldziher/kreuzberg:latest
176
210
  ports:
177
211
  - containerPort: 8000
212
+ livenessProbe:
213
+ httpGet:
214
+ path: /health
215
+ port: 8000
216
+ initialDelaySeconds: 30
217
+ periodSeconds: 10
218
+ readinessProbe:
219
+ httpGet:
220
+ path: /health
221
+ port: 8000
222
+ initialDelaySeconds: 5
223
+ periodSeconds: 5
224
+ env:
225
+ - name: OTEL_SERVICE_NAME
226
+ value: "kreuzberg-api"
178
227
  resources:
179
228
  requests:
180
229
  memory: "512Mi"
@@ -9,6 +9,7 @@ All extraction functions accept an optional `config` parameter of type `Extracti
9
9
  - Control OCR behavior with `force_ocr` and `ocr_backend`
10
10
  - Provide engine-specific OCR configuration via `ocr_config`
11
11
  - Enable table extraction with `extract_tables` and configure it via `gmft_config`
12
+ - Enable automatic language detection with `auto_detect_language`
12
13
  - Add validation and post-processing hooks
13
14
  - Configure custom extractors
14
15
 
@@ -100,6 +101,58 @@ Note that table extraction requires the `gmft` dependency. You can install it wi
100
101
  pip install "kreuzberg[gmft]"
101
102
  ```
102
103
 
104
+ ### Language Detection
105
+
106
+ Kreuzberg can automatically detect the language of extracted text using fast-langdetect:
107
+
108
+ ```python
109
+ from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
110
+
111
+ # Simple automatic language detection
112
+ result = await extract_file("multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True))
113
+
114
+ # Access detected languages (lowercase ISO 639-1 codes)
115
+ if result.detected_languages:
116
+ print(f"Detected languages: {', '.join(result.detected_languages)}")
117
+ # Example output: "Detected languages: en, de, fr"
118
+
119
+ # Advanced configuration with multilingual detection
120
+ lang_config = LanguageDetectionConfig(
121
+ multilingual=True, # Enable mixed-language detection
122
+ top_k=5, # Return top 5 languages
123
+ low_memory=False, # Use high accuracy mode
124
+ cache_dir="/tmp/lang_models", # Custom model cache directory
125
+ )
126
+
127
+ result = await extract_file(
128
+ "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
129
+ )
130
+
131
+ # Use detected languages for OCR
132
+ if result.detected_languages:
133
+ # Re-extract with OCR using the primary detected language
134
+ from kreuzberg import TesseractConfig
135
+
136
+ result_with_ocr = await extract_file(
137
+ "multilingual_document.pdf",
138
+ config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=result.detected_languages[0])),
139
+ )
140
+ ```
141
+
142
+ #### Language Detection Configuration Options
143
+
144
+ - `low_memory` (default: `True`): Use smaller model (~200MB) vs larger, more accurate model
145
+ - `multilingual` (default: `False`): Enable detection of multiple languages in mixed text
146
+ - `top_k` (default: `3`): Maximum number of languages to return
147
+ - `cache_dir`: Custom directory for language model storage
148
+ - `allow_fallback` (default: `True`): Fall back to small model if large model fails
149
+
150
+ The feature requires the `langdetect` dependency:
151
+
152
+ ```shell
153
+ pip install "kreuzberg[langdetect]"
154
+ ```
155
+
103
156
  ### Batch Processing
104
157
 
105
158
  ```python
@@ -62,15 +62,15 @@ result = await extract_file("document.pdf", config=ExtractionConfig(ocr_config=T
62
62
 
63
63
  #### Available PSM Modes
64
64
 
65
- | Mode | Enum Value | Description | Best For |
66
- | -------------------- | ------------------------- | -------------------------------------------------------- | ---------------------------------------------- |
67
- | Automatic | `PSMMode.AUTO` | Automatic page segmentation with orientation detection | General purpose (default) |
68
- | Single Block | `PSMMode.SINGLE_BLOCK` | Treat the image as a single text block | Simple layouts, preserving paragraph structure |
69
- | Single Line | `PSMMode.SINGLE_LINE` | Treat the image as a single text line | Receipts, labels, single-line text |
70
- | Single Word | `PSMMode.SINGLE_WORD` | Treat the image as a single word | Word recognition tasks |
71
- | Single Character | `PSMMode.SINGLE_CHAR` | Treat the image as a single character | Character recognition tasks |
72
- | Sparse Text | `PSMMode.SPARSE_TEXT` | Find as much text as possible without assuming structure | Forms, tables, scattered text |
73
- | Sparse Text with OSD | `PSMMode.SPARSE_TEXT_OSD` | Like SPARSE_TEXT with orientation detection | Complex layouts with varying text orientation |
65
+ | Mode | Enum Value | Description | Best For |
66
+ | ------------- | ----------------------- | -------------------------------------------------------- | ---------------------------------------------- |
67
+ | Auto Only | `PSMMode.AUTO_ONLY` | Automatic segmentation without orientation detection | Modern documents (default - fastest) |
68
+ | Automatic | `PSMMode.AUTO` | Automatic page segmentation with orientation detection | Rotated/skewed documents |
69
+ | Single Block | `PSMMode.SINGLE_BLOCK` | Treat the image as a single text block | Simple layouts, preserving paragraph structure |
70
+ | Single Column | `PSMMode.SINGLE_COLUMN` | Assume a single column of text | Books, articles, single-column documents |
71
+ | Single Line | `PSMMode.SINGLE_LINE` | Treat the image as a single text line | Receipts, labels, single-line text |
72
+ | Single Word | `PSMMode.SINGLE_WORD` | Treat the image as a single word | Word recognition tasks |
73
+ | Sparse Text | `PSMMode.SPARSE_TEXT` | Find as much text as possible without assuming structure | Forms, tables, scattered text |
74
74
 
75
75
  ### Forcing OCR
76
76
 
@@ -139,23 +139,90 @@ result = await extract_file(
139
139
 
140
140
  ## Performance Optimization
141
141
 
142
- OCR performance and parallel processing can be controlled through process handlers and extraction hooks which are configured in the `ExtractionConfig` object. The default configuration handles performance optimization automatically.
142
+ ### Default Configuration
143
143
 
144
- This is useful for:
144
+ Kreuzberg's defaults are optimized out-of-the-box for modern PDFs and standard documents:
145
145
 
146
- - Limiting resource usage on systems with limited memory
147
- - Optimizing performance on systems with many CPU cores
148
- - Balancing OCR tasks with other application workloads
146
+ - **PSM Mode**: `AUTO_ONLY` - Faster than `AUTO` because it skips the orientation detection overhead
147
+ - **Language Model**: Disabled by default for optimal performance on modern documents
148
+ - **Dictionary Correction**: Enabled for accuracy
149
+
150
+ The default configuration provides excellent extraction quality for:
151
+
152
+ - Modern PDFs with embedded text
153
+ - Scanned documents with clear printing
154
+ - Office documents (DOCX, PPTX, XLSX)
155
+ - Standard business documents
156
+
157
+ ### Speed vs Quality Trade-offs
158
+
159
+ ```python
160
+ from kreuzberg import ExtractionConfig, TesseractConfig, PSMMode
161
+
162
+ # Default configuration (optimized for modern documents)
163
+ default_config = ExtractionConfig() # Already optimized for speed and quality
164
+
165
+ # Maximum speed configuration
166
+ speed_config = ExtractionConfig(
167
+ ocr_backend="tesseract",
168
+ ocr_config=TesseractConfig(
169
+ psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
170
+ tessedit_enable_dict_correction=False, # Skip dictionary correction
171
+ ),
172
+ )
173
+
174
+ # Maximum accuracy configuration (for degraded/historical documents)
175
+ accuracy_config = ExtractionConfig(
176
+ ocr_backend="tesseract",
177
+ ocr_config=TesseractConfig(
178
+ psm=PSMMode.AUTO, # Full analysis with orientation detection
179
+ language_model_ngram_on=True, # Enable for degraded/historical text
180
+ tessedit_enable_dict_correction=True, # Correct OCR errors
181
+ ),
182
+ )
183
+ ```
184
+
185
+ ### Language Model N-gram Settings
186
+
187
+ The `language_model_ngram_on` parameter controls Tesseract's use of n-gram language models:
188
+
189
+ - **Default (False)**: Optimized for modern documents with clear text
190
+ - **When to enable**: Historical documents, degraded scans, handwritten text, or noisy images
191
+
192
+ ```python
193
+ # For degraded or historical documents
194
+ historical_config = ExtractionConfig(
195
+ ocr_backend="tesseract",
196
+ ocr_config=TesseractConfig(
197
+ language_model_ngram_on=True, # Enable for better accuracy on poor quality text
198
+ ),
199
+ )
200
+ ```
201
+
202
+ ### When to Disable OCR
203
+
204
+ For documents with text layers (searchable PDFs, Office docs), disable OCR entirely:
205
+
206
+ ```python
207
+ # No OCR overhead for text documents
208
+ text_config = ExtractionConfig(ocr_backend=None)
209
+ ```
210
+
211
+ This provides a significant speedup: 78% of PDFs have text layers and extract in \<0.01s.
149
212
 
150
213
  ## Best Practices
151
214
 
152
215
  - **Language Selection**: Always specify the correct language for your documents to improve OCR accuracy
153
216
  - **PSM Mode Selection**: Choose the appropriate PSM mode based on your document layout:
154
- - Use `PSM.SINGLE_BLOCK` for documents with simple layouts
155
- - Use `PSM.SPARSE_TEXT` for forms or documents with tables
156
- - Use `PSM.SINGLE_LINE` for receipts or labels
217
+ - Use `PSMMode.AUTO_ONLY` (default) for modern, well-formatted documents
218
+ - Use `PSMMode.SINGLE_BLOCK` for faster processing of documents with simple layouts
219
+ - Use `PSMMode.SPARSE_TEXT` for forms or documents with tables
220
+ - Use `PSMMode.AUTO` only when orientation detection is needed
221
+ - **Performance Optimization**:
222
+ - Disable OCR (`ocr_backend=None`) for documents with text layers
223
+ - Keep the language model disabled (the default) for clean documents (`language_model_ngram_on=False`)
224
+ - Disable dictionary correction for technical documents
157
225
  - **Image Quality**: For best results, ensure images are:
158
226
  - High resolution (at least 300 DPI)
159
227
  - Well-lit with good contrast
160
- - Not skewed or rotated
161
- - **Performance**: For batch processing, adjust `max_processes` based on your system's capabilities
228
+ - Not skewed or rotated (unless using `PSMMode.AUTO`)
@@ -1,6 +1,7 @@
1
1
  from importlib.metadata import version
2
2
 
3
3
  from kreuzberg._gmft import GMFTConfig
4
+ from kreuzberg._language_detection import LanguageDetectionConfig
4
5
  from kreuzberg._ocr._easyocr import EasyOCRConfig
5
6
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
6
7
  from kreuzberg._ocr._tesseract import TesseractConfig
@@ -29,6 +30,7 @@ __all__ = [
29
30
  "ExtractorRegistry",
30
31
  "GMFTConfig",
31
32
  "KreuzbergError",
33
+ "LanguageDetectionConfig",
32
34
  "Metadata",
33
35
  "MissingDependencyError",
34
36
  "OCRError",
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
80
80
  if self.config.ocr_backend is None:
81
81
  raise ValidationError("ocr_backend is None, cannot perform OCR")
82
82
 
83
- from kreuzberg._ocr._tesseract import TesseractConfig
84
83
  from kreuzberg._types import ExtractionResult
85
84
 
86
85
  if self.config.ocr_backend == "tesseract":
87
86
  from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
87
+ from kreuzberg._ocr._tesseract import TesseractConfig
88
88
 
89
89
  if isinstance(self.config.ocr_config, TesseractConfig):
90
90
  config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
96
96
  return results[0]
97
97
  return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
98
 
99
+ if self.config.ocr_backend == "paddleocr":
100
+ from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
101
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
102
+
103
+ paddle_config = (
104
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
105
+ )
106
+
107
+ return paddle_process(path, paddle_config)
108
+
109
+ if self.config.ocr_backend == "easyocr":
110
+ from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
111
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
112
+
113
+ easy_config = (
114
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
115
+ )
116
+
117
+ return easy_process(path, easy_config)
118
+
99
119
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
100
120
 
101
121
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
299
299
  """Extract text from PDF using OCR (sync version)."""
300
300
  pdf = None
301
301
  try:
302
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
303
-
304
302
  images = []
305
303
  with pypdfium_file_lock(path):
306
304
  pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
325
323
  os.close(fd)
326
324
  image_paths.append(temp_path)
327
325
 
328
- if self.config.ocr_backend == "tesseract":
329
- from kreuzberg._ocr._tesseract import TesseractConfig
330
-
331
- if isinstance(self.config.ocr_config, TesseractConfig):
332
- config = self.config.ocr_config
333
- else:
334
- config = TesseractConfig()
335
- results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
336
- text_parts = [r.content for r in results]
337
- return "\n\n".join(text_parts)
338
-
339
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
326
+ return self._process_pdf_images_with_ocr(image_paths)
340
327
 
341
328
  finally:
342
329
  for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
349
336
  if pdf:
350
337
  with pypdfium_file_lock(path), contextlib.suppress(Exception):
351
338
  pdf.close()
339
+
340
+ def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
341
+ """Process PDF images with the configured OCR backend."""
342
+ if self.config.ocr_backend == "tesseract":
343
+ from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
344
+ from kreuzberg._ocr._tesseract import TesseractConfig
345
+
346
+ tesseract_config = (
347
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
348
+ )
349
+ results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
350
+ text_parts = [r.content for r in results]
351
+ return "\n\n".join(text_parts)
352
+
353
+ if self.config.ocr_backend == "paddleocr":
354
+ from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
355
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
356
+
357
+ paddle_config = (
358
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
359
+ )
360
+
361
+ text_parts = []
362
+ for image_path in image_paths:
363
+ result = paddle_process(Path(image_path), paddle_config)
364
+ text_parts.append(result.content)
365
+ return "\n\n".join(text_parts)
366
+
367
+ if self.config.ocr_backend == "easyocr":
368
+ from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
369
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
370
+
371
+ easy_config = (
372
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
373
+ )
374
+
375
+ text_parts = []
376
+ for image_path in image_paths:
377
+ result = easy_process(Path(image_path), easy_config)
378
+ text_parts.append(result.content)
379
+ return "\n\n".join(text_parts)
380
+
381
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
@@ -6,7 +6,7 @@ import sys
6
6
  from datetime import date, datetime, time, timedelta
7
7
  from io import StringIO
8
8
  from pathlib import Path
9
- from typing import Any, Union
9
+ from typing import Any
10
10
 
11
11
  from anyio import Path as AsyncPath
12
12
  from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11): # pragma: no cover
23
23
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
24
24
 
25
25
 
26
- CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
26
+ CellValue = int | float | str | bool | time | date | datetime | timedelta
27
27
 
28
28
 
29
29
  class SpreadSheetExtractor(Extractor):