kreuzberg 3.10.1__tar.gz → 3.11.1__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (221)
  1. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/ci.yaml +3 -3
  2. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/docs.yml +1 -1
  3. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/pr-title.yaml +1 -1
  4. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/publish-docker.yml +1 -1
  5. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/release.yaml +1 -1
  6. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.pre-commit-config.yaml +9 -7
  7. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/PKG-INFO +13 -11
  8. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/contributing.md +1 -1
  9. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/examples/extraction-examples.md +4 -4
  10. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/installation.md +11 -1
  11. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/index.md +1 -1
  12. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/document-classification.md +9 -1
  13. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/extraction-configuration.md +3 -3
  14. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_config.py +18 -14
  15. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_document_classification.py +1 -1
  16. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_base.py +1 -2
  17. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_image.py +18 -17
  18. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pdf.py +30 -33
  19. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mcp/server.py +1 -1
  20. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_easyocr.py +8 -1
  21. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_paddleocr.py +2 -1
  22. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_types.py +11 -10
  23. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/mkdocs.yaml +0 -1
  24. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/pyproject.toml +14 -14
  25. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/document_classification_test.py +49 -14
  26. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/uv.lock +1037 -980
  27. kreuzberg-3.10.1/docs/changelog.md +0 -49
  28. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.commitlintrc +0 -0
  29. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.deepsource.toml +0 -0
  30. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.docker/Dockerfile +0 -0
  31. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.docker/README.md +0 -0
  32. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.dockerignore +0 -0
  33. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/dependabot.yaml +0 -0
  34. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.gitignore +0 -0
  35. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/.markdownlint.yaml +0 -0
  36. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/LICENSE +0 -0
  37. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/README.md +0 -0
  38. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/ai-rulez.yaml +0 -0
  39. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/README.md +0 -0
  40. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/benchmark_baseline.py +0 -0
  41. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/end_to_end_benchmark.py +0 -0
  42. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/final_benchmark.py +0 -0
  43. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/pyproject.toml +0 -0
  44. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/baseline_results.json +0 -0
  45. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  46. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  47. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/final_benchmark_results.json +0 -0
  48. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/latest.json +0 -0
  49. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/mime_caching_results.json +0 -0
  50. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  51. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/ocr_caching_results.json +0 -0
  52. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  53. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  54. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/results/table_caching_results.json +0 -0
  55. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/serialization_benchmark.py +0 -0
  56. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  57. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  58. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  59. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  60. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  61. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  62. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  63. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/benchmarks/statistical_benchmark.py +0 -0
  64. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/custom-extractors.md +0 -0
  65. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/custom-hooks.md +0 -0
  66. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/error-handling.md +0 -0
  67. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/index.md +0 -0
  68. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/advanced/performance.md +0 -0
  69. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/exceptions.md +0 -0
  70. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/extraction-functions.md +0 -0
  71. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/extractor-registry.md +0 -0
  72. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/index.md +0 -0
  73. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/ocr-configuration.md +0 -0
  74. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/api-reference/types.md +0 -0
  75. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/assets/favicon.png +0 -0
  76. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/assets/logo.png +0 -0
  77. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/cli.md +0 -0
  78. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/css/extra.css +0 -0
  79. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/examples/index.md +0 -0
  80. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/index.md +0 -0
  81. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/quick-start.md +0 -0
  82. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/performance-analysis.md +0 -0
  83. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/api-server.md +0 -0
  84. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/basic-usage.md +0 -0
  85. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/chunking.md +0 -0
  86. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/docker.md +0 -0
  87. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/index.md +0 -0
  88. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/mcp-server.md +0 -0
  89. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/metadata-extraction.md +0 -0
  90. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/ocr-backends.md +0 -0
  91. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/ocr-configuration.md +0 -0
  92. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/supported-formats.md +0 -0
  93. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/__init__.py +0 -0
  94. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/__main__.py +0 -0
  95. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_api/__init__.py +0 -0
  96. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_api/main.py +0 -0
  97. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_chunker.py +0 -0
  98. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_constants.py +0 -0
  99. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_entity_extraction.py +0 -0
  100. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/__init__.py +0 -0
  101. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_email.py +0 -0
  102. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_html.py +0 -0
  103. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  104. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_presentation.py +0 -0
  105. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  106. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_structured.py +0 -0
  107. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_gmft.py +0 -0
  108. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_language_detection.py +0 -0
  109. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mcp/__init__.py +0 -0
  110. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mime_types.py +0 -0
  111. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/__init__.py +0 -0
  112. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_base.py +0 -0
  113. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_tesseract.py +0 -0
  114. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_playa.py +0 -0
  115. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_registry.py +0 -0
  116. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/__init__.py +0 -0
  117. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_cache.py +0 -0
  118. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_device.py +0 -0
  119. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_document_cache.py +0 -0
  120. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_errors.py +0 -0
  121. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  122. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_process_pool.py +0 -0
  123. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_quality.py +0 -0
  124. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_serialization.py +0 -0
  125. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_string.py +0 -0
  126. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_sync.py +0 -0
  127. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_table.py +0 -0
  128. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_utils/_tmp.py +0 -0
  129. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/cli.py +0 -0
  130. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/exceptions.py +0 -0
  131. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/extraction.py +0 -0
  132. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/py.typed +0 -0
  133. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/__init__.py +0 -0
  134. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/api/__init__.py +0 -0
  135. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/api/main_test.py +0 -0
  136. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/chunker_test.py +0 -0
  137. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_command_test.py +0 -0
  138. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_integration_test.py +0 -0
  139. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/cli_test.py +0 -0
  140. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/config_test.py +0 -0
  141. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/conftest.py +0 -0
  142. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/entity_extraction_test.py +0 -0
  143. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/exceptions_test.py +0 -0
  144. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extraction_batch_test.py +0 -0
  145. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extraction_test.py +0 -0
  146. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/__init__.py +0 -0
  147. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/email_test.py +0 -0
  148. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/html_test.py +0 -0
  149. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/image_test.py +0 -0
  150. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  151. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pandoc_test.py +0 -0
  152. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/pdf_test.py +0 -0
  153. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/presentation_test.py +0 -0
  154. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/spreed_sheet_test.py +0 -0
  155. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/extractors/structured_test.py +0 -0
  156. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/gmft_extended_test.py +0 -0
  157. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/gmft_test.py +0 -0
  158. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/hooks_test.py +0 -0
  159. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/language_detection_test.py +0 -0
  160. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/mcp_server_test.py +0 -0
  161. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/mime_types_test.py +0 -0
  162. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/__init__.py +0 -0
  163. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  164. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  165. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/process_manager_test.py +0 -0
  166. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  167. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/__init__.py +0 -0
  168. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/base_test.py +0 -0
  169. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/device_integration_test.py +0 -0
  170. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/easyocr_test.py +0 -0
  171. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/init_test.py +0 -0
  172. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/paddleocr_test.py +0 -0
  173. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/ocr/tesseract_test.py +0 -0
  174. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/playa_helpers_test.py +0 -0
  175. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/playa_test.py +0 -0
  176. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/registry_test.py +0 -0
  177. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
  178. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/contract.txt +0 -0
  179. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/contract_test.txt +0 -0
  180. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/document.docx +0 -0
  181. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/email/sample-email.eml +0 -0
  182. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  183. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/excel.xlsx +0 -0
  184. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/form_test.txt +0 -0
  185. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/french-text.txt +0 -0
  186. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/german-text.txt +0 -0
  187. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/html.html +0 -0
  188. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  189. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/invoice_image.png +0 -0
  190. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/invoice_test.txt +0 -0
  191. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/json/sample-document.json +0 -0
  192. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  193. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/markdown.md +0 -0
  194. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  195. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/non-searchable.pdf +0 -0
  196. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/ocr-image.jpg +0 -0
  197. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  198. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  199. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  200. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  201. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/receipt_test.txt +0 -0
  202. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/report_test.txt +0 -0
  203. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/sample-contract.pdf +0 -0
  204. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/scanned.pdf +0 -0
  205. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/searchable.pdf +0 -0
  206. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/spanish-text.txt +0 -0
  207. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/test-article.pdf +0 -0
  208. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  209. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/types_test.py +0 -0
  210. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/__init__.py +0 -0
  211. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/cache_test.py +0 -0
  212. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/device_test.py +0 -0
  213. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/errors_test.py +0 -0
  214. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/pdf_lock_test.py +0 -0
  215. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/process_pool_test.py +0 -0
  216. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/serialization_test.py +0 -0
  217. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/string_test.py +0 -0
  218. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/sync_test.py +0 -0
  219. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/table_test.py +0 -0
  220. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils/tmp_test.py +0 -0
  221. {kreuzberg-3.10.1 → kreuzberg-3.11.1}/tests/utils_errors_test.py +0 -0
@@ -15,7 +15,7 @@ jobs:
15
15
  timeout-minutes: 10
16
16
  steps:
17
17
  - name: Checkout
18
- uses: actions/checkout@v4
18
+ uses: actions/checkout@v5
19
19
 
20
20
  - name: Install uv
21
21
  uses: astral-sh/setup-uv@v6
@@ -58,7 +58,7 @@ jobs:
58
58
  timeout-minutes: 20
59
59
  steps:
60
60
  - name: Checkout
61
- uses: actions/checkout@v4
61
+ uses: actions/checkout@v5
62
62
 
63
63
  - name: Install uv
64
64
  uses: astral-sh/setup-uv@v6
@@ -151,7 +151,7 @@ jobs:
151
151
  timeout-minutes: 30
152
152
  steps:
153
153
  - name: Checkout
154
- uses: actions/checkout@v4
154
+ uses: actions/checkout@v5
155
155
 
156
156
  - name: Install uv
157
157
  uses: astral-sh/setup-uv@v6
@@ -24,7 +24,7 @@ jobs:
24
24
  runs-on: ubuntu-latest
25
25
  steps:
26
26
  - name: Checkout repository
27
- uses: actions/checkout@v4
27
+ uses: actions/checkout@v5
28
28
  with:
29
29
  fetch-depth: 0
30
30
 
@@ -15,6 +15,6 @@ jobs:
15
15
  name: Validate PR title
16
16
  runs-on: ubuntu-latest
17
17
  steps:
18
- - uses: amannn/action-semantic-pull-request@v5
18
+ - uses: amannn/action-semantic-pull-request@v6
19
19
  env:
20
20
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -46,7 +46,7 @@ jobs:
46
46
  df -h
47
47
 
48
48
  - name: Checkout repository
49
- uses: actions/checkout@v4
49
+ uses: actions/checkout@v5
50
50
  with:
51
51
  ref: ${{ github.ref }}
52
52
 
@@ -13,7 +13,7 @@ jobs:
13
13
  contents: read
14
14
  steps:
15
15
  - name: Checkout
16
- uses: actions/checkout@v4
16
+ uses: actions/checkout@v5
17
17
 
18
18
  - name: Install uv
19
19
  uses: astral-sh/setup-uv@v6
@@ -5,13 +5,15 @@ repos:
5
5
  - id: commitlint
6
6
  stages: [commit-msg]
7
7
  additional_dependencies: ["@commitlint/config-conventional"]
8
- - repo: https://github.com/Goldziher/ai-rulez
9
- rev: v1.1.4
10
- hooks:
11
- - id: ai-rulez-validate
12
- - id: ai-rulez-generate
8
+ # Temporarily disabled - ai-rulez Go build failing in CI
9
+ # TODO: Re-enable once ai-rulez v1.4.4+ Python migration is stable
10
+ # - repo: https://github.com/Goldziher/ai-rulez
11
+ # rev: v1.4.3
12
+ # hooks:
13
+ # - id: ai-rulez-validate
14
+ # - id: ai-rulez-generate
13
15
  - repo: https://github.com/pre-commit/pre-commit-hooks
14
- rev: v5.0.0
16
+ rev: v6.0.0
15
17
  hooks:
16
18
  - id: name-tests-test
17
19
  args:
@@ -53,7 +55,7 @@ repos:
53
55
  hooks:
54
56
  - id: pyproject-fmt
55
57
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.7
58
+ rev: v0.12.8
57
59
  hooks:
58
60
  - id: ruff
59
61
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.10.1
3
+ Version: 3.11.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.9.0
31
+ Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.2
35
+ Requires-Dist: mcp>=1.12.4
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: playa-pdf>=0.6.4
37
+ Requires-Dist: playa-pdf>=0.7.0
38
38
  Requires-Dist: psutil>=7.0.0
39
39
  Requires-Dist: pypdfium2==4.30.0
40
40
  Requires-Dist: python-calamine>=0.3.2
@@ -45,25 +45,24 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
45
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
46
  Provides-Extra: all
47
47
  Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: deep-translator>=1.11.4; extra == 'all'
48
49
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
50
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
51
  Requires-Dist: gmft>=0.4.2; extra == 'all'
51
52
  Requires-Dist: keybert>=0.9.0; extra == 'all'
52
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
53
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
53
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
55
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
- Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
57
+ Requires-Dist: pandas>=2.3.1; extra == 'all'
58
+ Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
57
59
  Requires-Dist: rich>=14.1.0; extra == 'all'
58
60
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
59
61
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
60
62
  Requires-Dist: spacy>=3.8.7; extra == 'all'
61
63
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
62
64
  Provides-Extra: api
63
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
64
- Provides-Extra: auto-classify-document-type
65
- Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
66
- Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
65
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
67
66
  Provides-Extra: chunking
68
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
69
68
  Provides-Extra: cli
@@ -71,7 +70,10 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
71
70
  Requires-Dist: rich>=14.1.0; extra == 'cli'
72
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
72
  Provides-Extra: crypto
74
- Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
73
+ Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
74
+ Provides-Extra: document-classification
75
+ Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
76
+ Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
75
77
  Provides-Extra: easyocr
76
78
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
77
79
  Provides-Extra: entity-extraction
@@ -34,7 +34,7 @@ All commands run through `uv run`:
34
34
  # Testing
35
35
  uv run pytest # Run all tests
36
36
  uv run pytest tests/foo_test.py # Run specific test
37
- uv run pytest --cov # With coverage (must be ≥95%)
37
+ uv run pytest --cov # With coverage (must be ≥85%)
38
38
 
39
39
  # Code quality
40
40
  uv run ruff format # Format code
@@ -132,15 +132,15 @@ async def extract_tables_from_pdf():
132
132
  # Process extracted tables
133
133
  print(f"Found {len(result.tables)} tables")
134
134
  for i, table in enumerate(result.tables):
135
- print(f"Table {i+1} on page {table.page_number}:")
136
- print(table.text) # Markdown formatted table
135
+ print(f"Table {i+1} on page {table['page_number']}:")
136
+ print(table["text"]) # Markdown formatted table
137
137
 
138
138
  # Work with the pandas DataFrame
139
- df = table.df
139
+ df = table["df"]
140
140
  print(f"Table shape: {df.shape}")
141
141
 
142
142
  # The cropped table image is also available
143
- # table.cropped_image.save(f"table_{i+1}.png")
143
+ # table['cropped_image'].save(f"table_{i+1}.png")
144
144
 
145
145
  # With custom GMFT configuration
146
146
  custom_config = ExtractionConfig(
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
134
134
 
135
135
  spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
136
136
 
137
+ ### Document Classification
138
+
139
+ For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
140
+
141
+ ```shell
142
+ pip install "kreuzberg[document-classification]"
143
+ ```
144
+
145
+ This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
146
+
137
147
  ### All Optional Dependencies
138
148
 
139
149
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
145
155
  This is equivalent to:
146
156
 
147
157
  ```shell
148
- pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
158
+ pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
149
159
  ```
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
22
22
 
23
23
  ### Engineering Principles
24
24
 
25
- - **Test Coverage**: 95%+ coverage with comprehensive test suites
25
+ - **Test Coverage**: Comprehensive test suites ensuring code reliability
26
26
  - **API Design**: True async/await implementation alongside synchronous APIs
27
27
  - **Error Handling**: Consistent exception hierarchy with detailed context
28
28
  - **Type Safety**: Full type annotations for enhanced developer experience
@@ -2,9 +2,17 @@
2
2
 
3
3
  Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
4
4
 
5
+ ## Installation
6
+
7
+ Document classification requires the `document-classification` extra to be installed:
8
+
9
+ ```bash
10
+ pip install "kreuzberg[document-classification]"
11
+ ```
12
+
5
13
  ## Enabling Document Classification
6
14
 
7
- To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
15
+ Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
8
16
 
9
17
  ```python
10
18
  from kreuzberg import ExtractionConfig, extract_file
@@ -237,10 +237,10 @@ result = await extract_file("document_with_tables.pdf", config=config)
237
237
 
238
238
  # Access extracted tables
239
239
  for i, table in enumerate(result.tables):
240
- print(f"Table {i+1} on page {table.page_number}:")
241
- print(table.text) # Markdown formatted table text
240
+ print(f"Table {i+1} on page {table['page_number']}:")
241
+ print(table["text"]) # Markdown formatted table text
242
242
  # You can also access the pandas DataFrame directly
243
- df = table.df
243
+ df = table["df"]
244
244
  print(df.shape) # (rows, columns)
245
245
  ```
246
246
 
@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
97
97
  if not isinstance(backend_config, dict):
98
98
  return None
99
99
 
100
- if backend == "tesseract":
101
- # Convert psm integer to PSMMode enum if needed
102
- processed_config = backend_config.copy()
103
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
104
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
105
-
106
- processed_config["psm"] = PSMMode(processed_config["psm"])
107
- return TesseractConfig(**processed_config)
108
- if backend == "easyocr":
109
- return EasyOCRConfig(**backend_config)
110
- if backend == "paddleocr":
111
- return PaddleOCRConfig(**backend_config)
112
- return None
100
+ match backend:
101
+ case "tesseract":
102
+ # Convert psm integer to PSMMode enum if needed
103
+ processed_config = backend_config.copy()
104
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
+ from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
106
+
107
+ processed_config["psm"] = PSMMode(processed_config["psm"])
108
+ return TesseractConfig(**processed_config)
109
+ case "easyocr":
110
+ return EasyOCRConfig(**backend_config)
111
+ case "paddleocr":
112
+ return PaddleOCRConfig(**backend_config)
113
+ case _:
114
+ return None
113
115
 
114
116
 
115
117
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
140
142
  "document_classification_mode",
141
143
  "keyword_count",
142
144
  }
143
- extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
145
+ extraction_config = extraction_config | {
146
+ field: config_dict[field] for field in basic_fields if field in config_dict
147
+ }
144
148
 
145
149
  # Handle OCR backend configuration
146
150
  ocr_backend = extraction_config.get("ocr_backend")
@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
62
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
63
63
  except ImportError as e: # pragma: no cover
64
64
  raise MissingDependencyError(
65
- "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
65
+ "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
66
66
  ) from e
67
67
 
68
68
  try:
@@ -116,8 +116,7 @@ class Extractor(ABC):
116
116
  quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
117
 
118
118
  # Add quality metadata
119
- enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
- enhanced_metadata["quality_score"] = quality_score
119
+ enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
121
120
 
122
121
  # Return enhanced result
123
122
  return ExtractionResult(
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
85
85
 
86
86
  backend = get_ocr_backend(self.config.ocr_backend)
87
87
 
88
- if self.config.ocr_backend == "tesseract":
89
- config = (
90
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
91
- )
92
- result = backend.process_file_sync(path, **asdict(config))
93
- elif self.config.ocr_backend == "paddleocr":
94
- paddle_config = (
95
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
96
- )
97
- result = backend.process_file_sync(path, **asdict(paddle_config))
98
- elif self.config.ocr_backend == "easyocr":
99
- easy_config = (
100
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
101
- )
102
- result = backend.process_file_sync(path, **asdict(easy_config))
103
- else:
104
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
88
+ match self.config.ocr_backend:
89
+ case "tesseract":
90
+ config = (
91
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
92
+ )
93
+ result = backend.process_file_sync(path, **asdict(config))
94
+ case "paddleocr":
95
+ paddle_config = (
96
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
97
+ )
98
+ result = backend.process_file_sync(path, **asdict(paddle_config))
99
+ case "easyocr":
100
+ easy_config = (
101
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
102
+ )
103
+ result = backend.process_file_sync(path, **asdict(easy_config))
104
+ case _:
105
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
105
106
  return self._apply_quality_processing(result)
106
107
 
107
108
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
88
88
  # Enhance metadata with table information
89
89
  if result.tables:
90
90
  table_summary = generate_table_summary(result.tables)
91
- result.metadata.update(
92
- {
93
- "table_count": table_summary["table_count"],
94
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
95
- f"across {table_summary['pages_with_tables']} pages with "
96
- f"{table_summary['total_rows']} total rows",
97
- }
98
- )
91
+ result.metadata = result.metadata | {
92
+ "table_count": table_summary["table_count"],
93
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
94
+ f"across {table_summary['pages_with_tables']} pages with "
95
+ f"{table_summary['total_rows']} total rows",
96
+ }
99
97
 
100
98
  return self._apply_quality_processing(result)
101
99
 
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
153
151
  # Enhance metadata with table information
154
152
  if tables:
155
153
  table_summary = generate_table_summary(tables)
156
- result.metadata.update(
157
- {
158
- "table_count": table_summary["table_count"],
159
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
160
- f"across {table_summary['pages_with_tables']} pages with "
161
- f"{table_summary['total_rows']} total rows",
162
- }
163
- )
154
+ result.metadata = result.metadata | {
155
+ "table_count": table_summary["table_count"],
156
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
157
+ f"across {table_summary['pages_with_tables']} pages with "
158
+ f"{table_summary['total_rows']} total rows",
159
+ }
164
160
 
165
161
  # Apply quality processing
166
162
  return self._apply_quality_processing(result)
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
386
382
  backend = get_ocr_backend(self.config.ocr_backend)
387
383
  paths = [Path(p) for p in image_paths]
388
384
 
389
- if self.config.ocr_backend == "tesseract":
390
- config = (
391
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
392
- )
393
- results = backend.process_batch_sync(paths, **asdict(config))
394
- elif self.config.ocr_backend == "paddleocr":
395
- paddle_config = (
396
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
397
- )
398
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
399
- elif self.config.ocr_backend == "easyocr":
400
- easy_config = (
401
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
402
- )
403
- results = backend.process_batch_sync(paths, **asdict(easy_config))
404
- else:
405
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
385
+ match self.config.ocr_backend:
386
+ case "tesseract":
387
+ config = (
388
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
389
+ )
390
+ results = backend.process_batch_sync(paths, **asdict(config))
391
+ case "paddleocr":
392
+ paddle_config = (
393
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
394
+ )
395
+ results = backend.process_batch_sync(paths, **asdict(paddle_config))
396
+ case "easyocr":
397
+ easy_config = (
398
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
399
+ )
400
+ results = backend.process_batch_sync(paths, **asdict(easy_config))
401
+ case _:
402
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
406
403
 
407
404
  # Use list comprehension and join for efficient string building
408
405
  return "\n\n".join(result.content for result in results)
@@ -51,7 +51,7 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
51
51
  }
52
52
 
53
53
  # Override with provided parameters
54
- config_dict.update(kwargs)
54
+ config_dict = config_dict | kwargs
55
55
 
56
56
  return ExtractionConfig(**config_dict)
57
57
 
@@ -4,7 +4,6 @@ import warnings
4
4
  from dataclasses import dataclass
5
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
6
6
 
7
- import numpy as np
8
7
  from PIL import Image
9
8
 
10
9
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
188
187
 
189
188
  kwargs.pop("language", None)
190
189
  kwargs.pop("use_gpu", None)
190
+ kwargs.pop("device", None)
191
+ kwargs.pop("gpu_memory_limit", None)
192
+ kwargs.pop("fallback_to_cpu", None)
191
193
 
192
194
  try:
193
195
  result = await run_sync(
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
455
457
  Raises:
456
458
  OCRError: If OCR processing fails.
457
459
  """
460
+ import numpy as np # noqa: PLC0415
461
+
458
462
  self._init_easyocr_sync(**kwargs)
459
463
 
460
464
  beam_width = kwargs.pop("beam_width")
461
465
  kwargs.pop("language", None)
462
466
  kwargs.pop("use_gpu", None)
467
+ kwargs.pop("device", None)
468
+ kwargs.pop("gpu_memory_limit", None)
469
+ kwargs.pop("fallback_to_cpu", None)
463
470
 
464
471
  try:
465
472
  result = self._reader.readtext(
@@ -7,7 +7,6 @@ from importlib.util import find_spec
7
7
  from pathlib import Path
8
8
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
9
9
 
10
- import numpy as np
11
10
  from PIL import Image
12
11
 
13
12
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
380
379
  Raises:
381
380
  OCRError: If OCR processing fails.
382
381
  """
382
+ import numpy as np # noqa: PLC0415
383
+
383
384
  self._init_paddle_ocr_sync(**kwargs)
384
385
 
385
386
  if image.mode != "RGB":
@@ -349,7 +349,7 @@ class ExtractionConfig:
349
349
  """Configuration for language detection. If None, uses default settings."""
350
350
  spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
351
351
  """Configuration for spaCy entity extraction. If None, uses default settings."""
352
- auto_detect_document_type: bool = True
352
+ auto_detect_document_type: bool = False
353
353
  """Whether to automatically detect the document type."""
354
354
  document_type_confidence_threshold: float = 0.5
355
355
  """Confidence threshold for document type detection."""
@@ -398,15 +398,16 @@ class ExtractionConfig:
398
398
  return asdict(self.ocr_config)
399
399
 
400
400
  # Lazy load and cache default configs instead of creating new instances
401
- if self.ocr_backend == "tesseract":
402
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
401
+ match self.ocr_backend:
402
+ case "tesseract":
403
+ from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
403
404
 
404
- return asdict(TesseractConfig())
405
- if self.ocr_backend == "easyocr":
406
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
405
+ return asdict(TesseractConfig())
406
+ case "easyocr":
407
+ from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
407
408
 
408
- return asdict(EasyOCRConfig())
409
- # paddleocr
410
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
409
+ return asdict(EasyOCRConfig())
410
+ case _: # paddleocr or any other backend
411
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
411
412
 
412
- return asdict(PaddleOCRConfig())
413
+ return asdict(PaddleOCRConfig())
@@ -158,4 +158,3 @@ nav:
158
158
  - Custom Hooks: advanced/custom-hooks.md
159
159
  - Custom Extractors: advanced/custom-extractors.md
160
160
  - Contributing: contributing.md
161
- - Changelog: changelog.md