kreuzberg 3.4.1__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/workflows/ci.yaml +1 -1
  2. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/workflows/publish-docker.yml +2 -3
  3. kreuzberg-3.5.0/.gitmodules +3 -0
  4. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/PKG-INFO +4 -3
  5. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/benchmark_baseline.py +1 -1
  6. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/final_benchmark.py +1 -1
  7. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/serialization_benchmark.py +0 -1
  8. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/advanced/performance.md +46 -9
  9. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/changelog.md +2 -0
  10. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/contributing.md +1 -1
  11. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/examples/extraction-examples.md +41 -0
  12. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/getting-started/installation.md +9 -1
  13. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/docker.md +50 -1
  14. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/extraction-configuration.md +53 -0
  15. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/ocr-configuration.md +86 -19
  16. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/__init__.py +2 -0
  17. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_image.py +21 -1
  18. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pdf.py +44 -14
  19. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_spread_sheet.py +2 -2
  20. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_gmft.py +4 -4
  21. kreuzberg-3.5.0/kreuzberg/_language_detection.py +95 -0
  22. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
  23. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/process_manager.py +2 -1
  24. kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
  25. kreuzberg-3.5.0/kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
  26. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_ocr/_easyocr.py +1 -1
  27. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_ocr/_tesseract.py +7 -3
  28. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_types.py +11 -4
  29. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_device.py +2 -2
  30. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_process_pool.py +2 -2
  31. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_sync.py +1 -5
  32. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_tmp.py +2 -2
  33. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/extraction.py +10 -0
  34. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/mkdocs.yaml +1 -0
  35. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/pyproject.toml +25 -5
  36. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/api/main_test.py +2 -5
  37. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/cli_integration_test.py +9 -1
  38. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extraction_test.py +10 -2
  39. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/image_test.py +17 -4
  40. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/pdf_test.py +8 -0
  41. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/gmft_extended_test.py +6 -17
  42. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/gmft_test.py +0 -3
  43. kreuzberg-3.5.0/tests/language_detection_test.py +237 -0
  44. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/multiprocessing/sync_tesseract_test.py +2 -3
  45. kreuzberg-3.5.0/tests/test_source_files/french-text.txt +2 -0
  46. kreuzberg-3.5.0/tests/test_source_files/german-text.txt +2 -0
  47. kreuzberg-3.5.0/tests/test_source_files/spanish-text.txt +2 -0
  48. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/cache_test.py +0 -3
  49. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/errors_test.py +0 -1
  50. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/process_pool_test.py +0 -3
  51. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/sync_test.py +0 -7
  52. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/uv.lock +68 -2
  53. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.commitlintrc +0 -0
  54. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.docker/Dockerfile +0 -0
  55. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.docker/README.md +0 -0
  56. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.dockerignore +0 -0
  57. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/dependabot.yaml +0 -0
  58. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/workflows/docs.yml +0 -0
  59. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/workflows/pr-title.yaml +0 -0
  60. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.github/workflows/release.yaml +0 -0
  61. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.gitignore +0 -0
  62. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.markdownlint.yaml +0 -0
  63. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/.pre-commit-config.yaml +0 -0
  64. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/LICENSE +0 -0
  65. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/README.md +0 -0
  66. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/ai-rulez.yaml +0 -0
  67. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/README.md +0 -0
  68. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/end_to_end_benchmark.py +0 -0
  69. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/pyproject.toml +0 -0
  70. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/baseline_results.json +0 -0
  71. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  72. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  73. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/final_benchmark_results.json +0 -0
  74. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/mime_caching_results.json +0 -0
  75. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  76. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/ocr_caching_results.json +0 -0
  77. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  78. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  79. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/results/table_caching_results.json +0 -0
  80. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  81. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  82. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  83. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  84. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  85. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  86. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  87. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/benchmarks/statistical_benchmark.py +0 -0
  88. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/advanced/custom-extractors.md +0 -0
  89. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/advanced/custom-hooks.md +0 -0
  90. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/advanced/error-handling.md +0 -0
  91. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/advanced/index.md +0 -0
  92. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/exceptions.md +0 -0
  93. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/extraction-functions.md +0 -0
  94. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/extractor-registry.md +0 -0
  95. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/index.md +0 -0
  96. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/ocr-configuration.md +0 -0
  97. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/api-reference/types.md +0 -0
  98. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/assets/favicon.png +0 -0
  99. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/assets/logo.png +0 -0
  100. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/cli.md +0 -0
  101. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/css/extra.css +0 -0
  102. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/examples/index.md +0 -0
  103. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/getting-started/index.md +0 -0
  104. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/getting-started/quick-start.md +0 -0
  105. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/index.md +0 -0
  106. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/api-server.md +0 -0
  107. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/basic-usage.md +0 -0
  108. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/chunking.md +0 -0
  109. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/index.md +0 -0
  110. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/metadata-extraction.md +0 -0
  111. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/ocr-backends.md +0 -0
  112. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/docs/user-guide/supported-formats.md +0 -0
  113. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/__main__.py +0 -0
  114. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_api/__init__.py +0 -0
  115. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_api/main.py +0 -0
  116. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_chunker.py +0 -0
  117. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_cli_config.py +0 -0
  118. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_constants.py +0 -0
  119. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/__init__.py +0 -0
  120. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_base.py +0 -0
  121. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_html.py +0 -0
  122. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  123. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_extractors/_presentation.py +0 -0
  124. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_mime_types.py +0 -0
  125. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
  126. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
  127. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
  128. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_ocr/__init__.py +0 -0
  129. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_ocr/_base.py +0 -0
  130. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  131. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_playa.py +0 -0
  132. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_registry.py +0 -0
  133. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/__init__.py +0 -0
  134. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_cache.py +0 -0
  135. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_document_cache.py +0 -0
  136. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_errors.py +0 -0
  137. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  138. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_serialization.py +0 -0
  139. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/_utils/_string.py +0 -0
  140. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/cli.py +0 -0
  141. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/exceptions.py +0 -0
  142. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/kreuzberg/py.typed +0 -0
  143. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/__init__.py +0 -0
  144. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/api/__init__.py +0 -0
  145. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/chunker_test.py +0 -0
  146. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/cli_test.py +0 -0
  147. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/conftest.py +0 -0
  148. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/exceptions_test.py +0 -0
  149. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extraction_batch_test.py +0 -0
  150. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/__init__.py +0 -0
  151. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/html_test.py +0 -0
  152. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  153. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/pandoc_test.py +0 -0
  154. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/presentation_test.py +0 -0
  155. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/extractors/spreed_sheet_test.py +0 -0
  156. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/hooks_test.py +0 -0
  157. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/mime_types_test.py +0 -0
  158. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/multiprocessing/__init__.py +0 -0
  159. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  160. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/multiprocessing/process_manager_test.py +0 -0
  161. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  162. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/__init__.py +0 -0
  163. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/base_test.py +0 -0
  164. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/device_integration_test.py +0 -0
  165. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/easyocr_test.py +0 -0
  166. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/init_test.py +0 -0
  167. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/paddleocr_test.py +0 -0
  168. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/ocr/tesseract_test.py +0 -0
  169. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/playa_test.py +0 -0
  170. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/registry_test.py +0 -0
  171. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/document.docx +0 -0
  172. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  173. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/excel.xlsx +0 -0
  174. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/html.html +0 -0
  175. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/markdown.md +0 -0
  176. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  177. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/non-searchable.pdf +0 -0
  178. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/ocr-image.jpg +0 -0
  179. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  180. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  181. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  182. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  183. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/sample-contract.pdf +0 -0
  184. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/scanned.pdf +0 -0
  185. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/searchable.pdf +0 -0
  186. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/test_source_files/test-article.pdf +0 -0
  187. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/types_test.py +0 -0
  188. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/__init__.py +0 -0
  189. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/device_test.py +0 -0
  190. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/pdf_lock_test.py +0 -0
  191. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/serialization_test.py +0 -0
  192. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/string_test.py +0 -0
  193. {kreuzberg-3.4.1 → kreuzberg-3.5.0}/tests/utils/tmp_test.py +0 -0
@@ -50,7 +50,7 @@ jobs:
50
50
  strategy:
51
51
  matrix:
52
52
  os: [ ubuntu-latest, macOS-latest, windows-latest ]
53
- python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.9", "3.10", "3.11", "3.12", "3.13"]') }}
53
+ python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
54
54
  runs-on: ${{ matrix.os }}
55
55
  timeout-minutes: 30
56
56
  steps:
@@ -1,4 +1,3 @@
1
- # .github/workflows/publish-docker.yml
2
1
 
3
2
  name: Publish Docker Images
4
3
 
@@ -24,7 +23,7 @@ jobs:
24
23
  include:
25
24
  - name: core
26
25
  extras: ""
27
- tag_suffix: "" # The base image tag (includes API + tesseract)
26
+ tag_suffix: ""
28
27
  - name: easyocr
29
28
  extras: "easyocr"
30
29
  tag_suffix: "-easyocr"
@@ -89,7 +88,7 @@ jobs:
89
88
  type=raw,value=latest${{ matrix.tag_suffix }}
90
89
 
91
90
  - name: Build and push Docker image
92
- uses: docker/build-push-action@v5
91
+ uses: docker/build-push-action@v6
93
92
  with:
94
93
  context: .
95
94
  file: ./.docker/Dockerfile
@@ -0,0 +1,3 @@
1
+ [submodule "python-text-extraction-libs-benchmarks"]
2
+ path = python-text-extraction-libs-benchmarks
3
+ url = https://github.com/Goldziher/python-text-extraction-libs-benchmarks.git
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -12,7 +12,6 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3 :: Only
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
@@ -22,7 +21,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
21
  Classifier: Topic :: Text Processing :: General
23
22
  Classifier: Topic :: Utilities
24
23
  Classifier: Typing :: Typed
25
- Requires-Python: >=3.9
24
+ Requires-Python: >=3.10
26
25
  Requires-Dist: anyio>=4.9.0
27
26
  Requires-Dist: charset-normalizer>=3.4.2
28
27
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
@@ -57,6 +56,8 @@ Provides-Extra: easyocr
57
56
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
58
57
  Provides-Extra: gmft
59
58
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
59
+ Provides-Extra: langdetect
60
+ Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
60
61
  Provides-Extra: paddleocr
61
62
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
62
63
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
8
8
  from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
9
9
 
10
10
 
11
- async def run_baseline_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
11
+ async def run_baseline_benchmark() -> dict[str, object] | None:
12
12
  """Run comprehensive baseline benchmark."""
13
13
  test_files_dir = Path("tests/test_source_files")
14
14
  test_files = list(test_files_dir.glob("*.pdf"))
@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
15
15
  )
16
16
 
17
17
 
18
- async def run_final_benchmark() -> dict[str, object] | None: # type: ignore[syntax]
18
+ async def run_final_benchmark() -> dict[str, object] | None:
19
19
  """Run comprehensive benchmark of all caching improvements."""
20
20
  test_files_dir = Path("tests/test_source_files")
21
21
  pdf_files = list(test_files_dir.glob("*.pdf"))
@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
101
101
  json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
102
102
  json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
103
103
 
104
- # Type casting for arithmetic operations
105
104
  json_ser_mean = json_serialize["mean"]
106
105
  json_deser_mean = json_deserialize["mean"]
107
106
  msgpack_ser_mean = msgpack_serialize["mean"]
@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
125
125
  1. **Configure OCR appropriately** for your document types
126
126
  1. **Profile your specific workload** - results vary by content
127
127
 
128
- ### Configuration Examples
128
+ ### Optimized Default Configuration
129
+
130
+ Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
129
131
 
130
132
  ```python
131
- from kreuzberg import ExtractionConfig, extract_file_sync
132
- from kreuzberg._ocr import TesseractConfig
133
+ from kreuzberg import ExtractionConfig
133
134
 
134
- # Optimized for speed
135
- fast_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=6)) # Assume uniform text block
135
+ # Default configuration - already optimized for modern documents
136
+ config = ExtractionConfig() # Uses optimized defaults:
137
+ # - PSM: AUTO_ONLY (fast without orientation detection)
138
+ # - Language model: Disabled for performance
139
+ # - Dictionary correction: Enabled for accuracy
140
+ ```
136
141
 
137
- # Optimized for accuracy
138
- accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1)) # Auto page segmentation
142
+ ### Advanced Configuration Examples
139
143
 
140
- # For simple documents (no OCR)
141
- text_only_config = ExtractionConfig(force_ocr=False, ocr_backend=None)
144
+ ```python
145
+ from kreuzberg import ExtractionConfig, extract_file_sync
146
+ from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
147
+
148
+ # Maximum speed configuration (for high-volume processing)
149
+ speed_config = ExtractionConfig(
150
+ ocr_backend="tesseract",
151
+ ocr_config=TesseractConfig(
152
+ psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
153
+ language_model_ngram_on=False, # Already disabled by default
154
+ tessedit_enable_dict_correction=False, # Disable for maximum speed
155
+ ),
156
+ )
157
+
158
+ # Maximum accuracy configuration (for degraded documents)
159
+ accuracy_config = ExtractionConfig(
160
+ ocr_backend="tesseract",
161
+ ocr_config=TesseractConfig(
162
+ psm=PSMMode.AUTO, # Full analysis with orientation detection
163
+ language_model_ngram_on=True, # Enable for historical/degraded text
164
+ tessedit_enable_dict_correction=True, # Default - keep enabled
165
+ ),
166
+ )
167
+
168
+ # No OCR configuration (text documents only)
169
+ text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
142
170
  ```
143
171
 
172
+ ### Performance Optimization Tips
173
+
174
+ Based on comprehensive benchmarking with 138+ documents:
175
+
176
+ 1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
177
+ 1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
178
+ 1. **Language model trade-offs**: Keeping `language_model_ngram_on` disabled (the default) can provide a 30x+ speedup with minimal quality impact on clean documents; enable it only for degraded or historical text
179
+ 1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
180
+
144
181
  ### Batch Processing Best Practices
145
182
 
146
183
  ```python
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
12
12
  - Documentation site with comprehensive examples and API reference
13
13
  - Improved configuration for all OCR backends
14
14
  - Added hooks system for validation and post-processing
15
+ - Language detection feature with `auto_detect_language` configuration option
16
+ - New optional dependency group `langdetect` for automatic language detection
15
17
 
16
18
  ### Changed
17
19
 
@@ -68,7 +68,7 @@ Use [Conventional Commits](https://www.conventionalcommits.org/):
68
68
 
69
69
  ## Notes
70
70
 
71
- - Python 3.9-3.13 supported
71
+ - Python 3.10-3.13 supported
72
72
  - System dependencies (optional): Tesseract, Pandoc
73
73
  - Pre-commit runs automatically on commit
74
74
  - Join our [Discord](https://discord.gg/pXxagNK2zN) for help
@@ -79,6 +79,47 @@ async def extract_with_different_backends():
79
79
  print(f"No OCR result: {result.content[:100]}...")
80
80
  ```
81
81
 
82
+ ## Language Detection
83
+
84
+ ```python
85
+ from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
86
+
87
+ async def detect_document_language():
88
+ # Simple automatic language detection
89
+ result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
90
+
91
+ # Access detected languages
92
+ if result.detected_languages:
93
+ print(f"Detected languages: {', '.join(result.detected_languages)}")
94
+ # Example output: "Detected languages: en, de, fr"
95
+
96
+ async def detect_multilingual_document():
97
+ # Advanced multilingual detection with custom configuration
98
+ lang_config = LanguageDetectionConfig(
99
+ multilingual=True, # Detect multiple languages in mixed text
100
+ top_k=5, # Return top 5 languages
101
+ low_memory=False, # Use high accuracy mode
102
+ )
103
+
104
+ result = await extract_file(
105
+ "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
106
+ )
107
+
108
+ if result.detected_languages:
109
+ print(f"Detected languages: {result.detected_languages}")
110
+
111
+ # Use detected languages for OCR
112
+ from kreuzberg import TesseractConfig
113
+
114
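+ # Join the detected codes with "+" for Tesseract. Detected codes are ISO 639-1 (e.g., "en"); map them to Tesseract language names (e.g., "eng+deu+fra") if your installation requires them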
115
+ tesseract_langs = "+".join(result.detected_languages[:3])
116
+
117
+ result_with_ocr = await extract_file(
118
+ "multilingual_document.pdf",
119
+ config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
120
+ )
121
+ ```
122
+
82
123
  ## Table Extraction
83
124
 
84
125
  ```python
@@ -102,6 +102,14 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
102
102
  pip install "kreuzberg[gmft]"
103
103
  ```
104
104
 
105
+ ### Language Detection
106
+
107
+ Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
108
+
109
+ ```shell
110
+ pip install "kreuzberg[langdetect]"
111
+ ```
112
+
105
113
  ### All Optional Dependencies
106
114
 
107
115
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -113,5 +121,5 @@ pip install "kreuzberg[all]"
113
121
  This is equivalent to:
114
122
 
115
123
  ```shell
116
- pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
124
+ pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
117
125
  ```
@@ -103,7 +103,7 @@ CMD ["python", "custom_config.py"]
103
103
 
104
104
  ### Base Image
105
105
 
106
- - Based on `python:3.13-bookworm`
106
+ - Based on `python:3.13-bookworm` (Kreuzberg itself requires Python 3.10+)
107
107
  - Includes system dependencies: `pandoc`, `tesseract-ocr`
108
108
  - Runs as non-root user `appuser`
109
109
  - Exposes port 8000
@@ -124,6 +124,34 @@ Additional dependencies by variant:
124
124
  - **gmft**: GMFT for table extraction
125
125
  - **all**: All optional dependencies
126
126
 
127
+ ### Health Check
128
+
129
+ All Docker images include a health check endpoint:
130
+
131
+ ```bash
132
+ # Check API health
133
+ curl http://localhost:8000/health
134
+ ```
135
+
136
+ Returns a JSON response with service status and version information.
137
+
138
+ ### Observability
139
+
140
+ The Docker images include built-in OpenTelemetry instrumentation via Litestar:
141
+
142
+ - **Tracing**: Automatic request/response tracing
143
+ - **Metrics**: Performance and usage metrics
144
+ - **Logging**: Structured JSON logging
145
+
146
+ Configure via standard OpenTelemetry environment variables:
147
+
148
+ ```bash
149
+ docker run -p 8000:8000 \
150
+ -e OTEL_SERVICE_NAME=kreuzberg-api \
151
+ -e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
152
+ goldziher/kreuzberg:latest
153
+ ```
154
+
127
155
  ### Environment Variables
128
156
 
129
157
  - `PYTHONUNBUFFERED=1` - Ensures proper logging output
@@ -150,6 +178,12 @@ server {
150
178
  client_max_body_size 100M;
151
179
  proxy_read_timeout 300s;
152
180
  }
181
+
182
+ # Health check endpoint
183
+ location /health {
184
+ proxy_pass http://localhost:8000/health;
185
+ access_log off;
186
+ }
153
187
  }
154
188
  ```
155
189
 
@@ -175,6 +209,21 @@ spec:
175
209
  image: goldziher/kreuzberg:latest
176
210
  ports:
177
211
  - containerPort: 8000
212
+ livenessProbe:
213
+ httpGet:
214
+ path: /health
215
+ port: 8000
216
+ initialDelaySeconds: 30
217
+ periodSeconds: 10
218
+ readinessProbe:
219
+ httpGet:
220
+ path: /health
221
+ port: 8000
222
+ initialDelaySeconds: 5
223
+ periodSeconds: 5
224
+ env:
225
+ - name: OTEL_SERVICE_NAME
226
+ value: "kreuzberg-api"
178
227
  resources:
179
228
  requests:
180
229
  memory: "512Mi"
@@ -9,6 +9,7 @@ All extraction functions accept an optional `config` parameter of type `Extracti
9
9
  - Control OCR behavior with `force_ocr` and `ocr_backend`
10
10
  - Provide engine-specific OCR configuration via `ocr_config`
11
11
  - Enable table extraction with `extract_tables` and configure it via `gmft_config`
12
+ - Enable automatic language detection with `auto_detect_language`
12
13
  - Add validation and post-processing hooks
13
14
  - Configure custom extractors
14
15
 
@@ -100,6 +101,58 @@ Note that table extraction requires the `gmft` dependency. You can install it wi
100
101
  pip install "kreuzberg[gmft]"
101
102
  ```
102
103
 
104
+ ### Language Detection
105
+
106
+ Kreuzberg can automatically detect the language of extracted text using fast-langdetect:
107
+
108
+ ```python
109
+ from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
110
+
111
+ # Simple automatic language detection
112
+ result = await extract_file("multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True))
113
+
114
+ # Access detected languages (lowercase ISO 639-1 codes)
115
+ if result.detected_languages:
116
+ print(f"Detected languages: {', '.join(result.detected_languages)}")
117
+ # Example output: "Detected languages: en, de, fr"
118
+
119
+ # Advanced configuration with multilingual detection
120
+ lang_config = LanguageDetectionConfig(
121
+ multilingual=True, # Enable mixed-language detection
122
+ top_k=5, # Return top 5 languages
123
+ low_memory=False, # Use high accuracy mode
124
+ cache_dir="/tmp/lang_models", # Custom model cache directory
125
+ )
126
+
127
+ result = await extract_file(
128
+ "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
129
+ )
130
+
131
+ # Use detected languages for OCR
132
+ if result.detected_languages:
133
+ # Re-extract with OCR using the primary detected language
134
+ from kreuzberg import TesseractConfig
135
+
136
+ result_with_ocr = await extract_file(
137
+ "multilingual_document.pdf",
138
+ config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=result.detected_languages[0])),
139
+ )
140
+ ```
141
+
142
+ #### Language Detection Configuration Options
143
+
144
+ - `low_memory` (default: `True`): Use the smaller model (~200MB) instead of the larger, more accurate model
145
+ - `multilingual` (default: `False`): Enable detection of multiple languages in mixed text
146
+ - `top_k` (default: `3`): Maximum number of languages to return
147
+ - `cache_dir`: Custom directory for language model storage
148
+ - `allow_fallback` (default: `True`): Fall back to small model if large model fails
149
+
150
+ The feature requires the `langdetect` dependency:
151
+
152
+ ```shell
153
+ pip install "kreuzberg[langdetect]"
154
+ ```
155
+
103
156
  ### Batch Processing
104
157
 
105
158
  ```python
@@ -62,15 +62,15 @@ result = await extract_file("document.pdf", config=ExtractionConfig(ocr_config=T
62
62
 
63
63
  #### Available PSM Modes
64
64
 
65
- | Mode | Enum Value | Description | Best For |
66
- | -------------------- | ------------------------- | -------------------------------------------------------- | ---------------------------------------------- |
67
- | Automatic | `PSMMode.AUTO` | Automatic page segmentation with orientation detection | General purpose (default) |
68
- | Single Block | `PSMMode.SINGLE_BLOCK` | Treat the image as a single text block | Simple layouts, preserving paragraph structure |
69
- | Single Line | `PSMMode.SINGLE_LINE` | Treat the image as a single text line | Receipts, labels, single-line text |
70
- | Single Word | `PSMMode.SINGLE_WORD` | Treat the image as a single word | Word recognition tasks |
71
- | Single Character | `PSMMode.SINGLE_CHAR` | Treat the image as a single character | Character recognition tasks |
72
- | Sparse Text | `PSMMode.SPARSE_TEXT` | Find as much text as possible without assuming structure | Forms, tables, scattered text |
73
- | Sparse Text with OSD | `PSMMode.SPARSE_TEXT_OSD` | Like SPARSE_TEXT with orientation detection | Complex layouts with varying text orientation |
65
+ | Mode | Enum Value | Description | Best For |
66
+ | ------------- | ----------------------- | -------------------------------------------------------- | ---------------------------------------------- |
67
+ | Auto Only | `PSMMode.AUTO_ONLY` | Automatic segmentation without orientation detection | Modern documents (default - fastest) |
68
+ | Automatic | `PSMMode.AUTO` | Automatic page segmentation with orientation detection | Rotated/skewed documents |
69
+ | Single Block | `PSMMode.SINGLE_BLOCK` | Treat the image as a single text block | Simple layouts, preserving paragraph structure |
70
+ | Single Column | `PSMMode.SINGLE_COLUMN` | Assume a single column of text | Books, articles, single-column documents |
71
+ | Single Line | `PSMMode.SINGLE_LINE` | Treat the image as a single text line | Receipts, labels, single-line text |
72
+ | Single Word | `PSMMode.SINGLE_WORD` | Treat the image as a single word | Word recognition tasks |
73
+ | Sparse Text | `PSMMode.SPARSE_TEXT` | Find as much text as possible without assuming structure | Forms, tables, scattered text |
74
74
 
75
75
  ### Forcing OCR
76
76
 
@@ -139,23 +139,90 @@ result = await extract_file(
139
139
 
140
140
  ## Performance Optimization
141
141
 
142
- OCR performance and parallel processing can be controlled through process handlers and extraction hooks which are configured in the `ExtractionConfig` object. The default configuration handles performance optimization automatically.
142
+ ### Default Configuration
143
143
 
144
- This is useful for:
144
+ Kreuzberg's defaults are optimized out-of-the-box for modern PDFs and standard documents:
145
145
 
146
- - Limiting resource usage on systems with limited memory
147
- - Optimizing performance on systems with many CPU cores
148
- - Balancing OCR tasks with other application workloads
146
+ - **PSM Mode**: `AUTO_ONLY` - Faster than `AUTO` because it skips the orientation detection overhead
147
+ - **Language Model**: Disabled by default for optimal performance on modern documents
148
+ - **Dictionary Correction**: Enabled for accuracy
149
+
150
+ The default configuration provides excellent extraction quality for:
151
+
152
+ - Modern PDFs with embedded text
153
+ - Scanned documents with clear printing
154
+ - Office documents (DOCX, PPTX, XLSX)
155
+ - Standard business documents
156
+
157
+ ### Speed vs Quality Trade-offs
158
+
159
+ ```python
160
+ from kreuzberg import ExtractionConfig, TesseractConfig, PSMMode
161
+
162
+ # Default configuration (optimized for modern documents)
163
+ default_config = ExtractionConfig() # Already optimized for speed and quality
164
+
165
+ # Maximum speed configuration
166
+ speed_config = ExtractionConfig(
167
+ ocr_backend="tesseract",
168
+ ocr_config=TesseractConfig(
169
+ psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
170
+ tessedit_enable_dict_correction=False, # Skip dictionary correction
171
+ ),
172
+ )
173
+
174
+ # Maximum accuracy configuration (for degraded/historical documents)
175
+ accuracy_config = ExtractionConfig(
176
+ ocr_backend="tesseract",
177
+ ocr_config=TesseractConfig(
178
+ psm=PSMMode.AUTO, # Full analysis with orientation detection
179
+ language_model_ngram_on=True, # Enable for degraded/historical text
180
+ tessedit_enable_dict_correction=True, # Correct OCR errors
181
+ ),
182
+ )
183
+ ```
184
+
185
+ ### Language Model N-gram Settings
186
+
187
+ The `language_model_ngram_on` parameter controls Tesseract's use of n-gram language models:
188
+
189
+ - **Default (False)**: Optimized for modern documents with clear text
190
+ - **When to enable**: Historical documents, degraded scans, handwritten text, or noisy images
191
+
192
+ ```python
193
+ # For degraded or historical documents
194
+ historical_config = ExtractionConfig(
195
+ ocr_backend="tesseract",
196
+ ocr_config=TesseractConfig(
197
+ language_model_ngram_on=True, # Enable for better accuracy on poor quality text
198
+ ),
199
+ )
200
+ ```
201
+
202
+ ### When to Disable OCR
203
+
204
+ For documents with text layers (searchable PDFs, Office docs), disable OCR entirely:
205
+
206
+ ```python
207
+ # No OCR overhead for text documents
208
+ text_config = ExtractionConfig(ocr_backend=None)
209
+ ```
210
+
211
+ This provides a significant speedup: 78% of PDFs have text layers and extract in \<0.01s.
149
212
 
150
213
  ## Best Practices
151
214
 
152
215
  - **Language Selection**: Always specify the correct language for your documents to improve OCR accuracy
153
216
  - **PSM Mode Selection**: Choose the appropriate PSM mode based on your document layout:
154
- - Use `PSM.SINGLE_BLOCK` for documents with simple layouts
155
- - Use `PSM.SPARSE_TEXT` for forms or documents with tables
156
- - Use `PSM.SINGLE_LINE` for receipts or labels
217
+ - Use `PSMMode.AUTO_ONLY` (default) for modern, well-formatted documents
218
+ - Use `PSMMode.SINGLE_BLOCK` for simple layouts with faster processing
219
+ - Use `PSMMode.SPARSE_TEXT` for forms or documents with tables
220
+ - Use `PSMMode.AUTO` only when orientation detection is needed
221
+ - **Performance Optimization**:
222
+ - Disable OCR (`ocr_backend=None`) for documents with text layers
223
+ - Keep the language model disabled (the default, `language_model_ngram_on=False`) for clean documents
224
+ - Disable dictionary correction for technical documents
157
225
  - **Image Quality**: For best results, ensure images are:
158
226
  - High resolution (at least 300 DPI)
159
227
  - Well-lit with good contrast
160
- - Not skewed or rotated
161
- - **Performance**: For batch processing, adjust `max_processes` based on your system's capabilities
228
+ - Not skewed or rotated (unless using `PSMMode.AUTO`)
@@ -1,6 +1,7 @@
1
1
  from importlib.metadata import version
2
2
 
3
3
  from kreuzberg._gmft import GMFTConfig
4
+ from kreuzberg._language_detection import LanguageDetectionConfig
4
5
  from kreuzberg._ocr._easyocr import EasyOCRConfig
5
6
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
6
7
  from kreuzberg._ocr._tesseract import TesseractConfig
@@ -29,6 +30,7 @@ __all__ = [
29
30
  "ExtractorRegistry",
30
31
  "GMFTConfig",
31
32
  "KreuzbergError",
33
+ "LanguageDetectionConfig",
32
34
  "Metadata",
33
35
  "MissingDependencyError",
34
36
  "OCRError",
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
80
80
  if self.config.ocr_backend is None:
81
81
  raise ValidationError("ocr_backend is None, cannot perform OCR")
82
82
 
83
- from kreuzberg._ocr._tesseract import TesseractConfig
84
83
  from kreuzberg._types import ExtractionResult
85
84
 
86
85
  if self.config.ocr_backend == "tesseract":
87
86
  from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
87
+ from kreuzberg._ocr._tesseract import TesseractConfig
88
88
 
89
89
  if isinstance(self.config.ocr_config, TesseractConfig):
90
90
  config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
96
96
  return results[0]
97
97
  return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
98
 
99
+ if self.config.ocr_backend == "paddleocr":
100
+ from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
101
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
102
+
103
+ paddle_config = (
104
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
105
+ )
106
+
107
+ return paddle_process(path, paddle_config)
108
+
109
+ if self.config.ocr_backend == "easyocr":
110
+ from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
111
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
112
+
113
+ easy_config = (
114
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
115
+ )
116
+
117
+ return easy_process(path, easy_config)
118
+
99
119
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
100
120
 
101
121
  def _get_extension_from_mime_type(self, mime_type: str) -> str: