kreuzberg 3.10.1__tar.gz → 3.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/PKG-INFO +7 -5
  2. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/contributing.md +1 -1
  3. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/installation.md +11 -1
  4. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/index.md +1 -1
  5. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/document-classification.md +9 -1
  6. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_config.py +18 -14
  7. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_document_classification.py +1 -1
  8. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_base.py +1 -2
  9. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_image.py +18 -17
  10. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pdf.py +30 -33
  11. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mcp/server.py +1 -1
  12. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_types.py +11 -10
  13. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/mkdocs.yaml +0 -1
  14. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/pyproject.toml +8 -8
  15. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/document_classification_test.py +49 -14
  16. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/uv.lock +85 -62
  17. kreuzberg-3.10.1/docs/changelog.md +0 -49
  18. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.commitlintrc +0 -0
  19. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.deepsource.toml +0 -0
  20. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.docker/Dockerfile +0 -0
  21. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.docker/README.md +0 -0
  22. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.dockerignore +0 -0
  23. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/dependabot.yaml +0 -0
  24. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/ci.yaml +0 -0
  25. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/docs.yml +0 -0
  26. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/pr-title.yaml +0 -0
  27. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/publish-docker.yml +0 -0
  28. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.github/workflows/release.yaml +0 -0
  29. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.gitignore +0 -0
  30. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.markdownlint.yaml +0 -0
  31. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/.pre-commit-config.yaml +0 -0
  32. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/LICENSE +0 -0
  33. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/README.md +0 -0
  34. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/ai-rulez.yaml +0 -0
  35. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/README.md +0 -0
  36. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/benchmark_baseline.py +0 -0
  37. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/end_to_end_benchmark.py +0 -0
  38. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/final_benchmark.py +0 -0
  39. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/pyproject.toml +0 -0
  40. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/baseline_results.json +0 -0
  41. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  42. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  43. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/final_benchmark_results.json +0 -0
  44. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/latest.json +0 -0
  45. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/mime_caching_results.json +0 -0
  46. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  47. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/ocr_caching_results.json +0 -0
  48. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  49. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  50. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/results/table_caching_results.json +0 -0
  51. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/serialization_benchmark.py +0 -0
  52. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  53. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  54. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
  55. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  56. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  57. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  58. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  59. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/benchmarks/statistical_benchmark.py +0 -0
  60. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/custom-extractors.md +0 -0
  61. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/custom-hooks.md +0 -0
  62. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/error-handling.md +0 -0
  63. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/index.md +0 -0
  64. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/advanced/performance.md +0 -0
  65. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/exceptions.md +0 -0
  66. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/extraction-functions.md +0 -0
  67. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/extractor-registry.md +0 -0
  68. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/index.md +0 -0
  69. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/ocr-configuration.md +0 -0
  70. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/api-reference/types.md +0 -0
  71. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/assets/favicon.png +0 -0
  72. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/assets/logo.png +0 -0
  73. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/cli.md +0 -0
  74. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/css/extra.css +0 -0
  75. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/examples/extraction-examples.md +0 -0
  76. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/examples/index.md +0 -0
  77. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/index.md +0 -0
  78. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/getting-started/quick-start.md +0 -0
  79. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/performance-analysis.md +0 -0
  80. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/api-server.md +0 -0
  81. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/basic-usage.md +0 -0
  82. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/chunking.md +0 -0
  83. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/docker.md +0 -0
  84. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/extraction-configuration.md +0 -0
  85. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/index.md +0 -0
  86. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/mcp-server.md +0 -0
  87. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/metadata-extraction.md +0 -0
  88. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/ocr-backends.md +0 -0
  89. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/ocr-configuration.md +0 -0
  90. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/docs/user-guide/supported-formats.md +0 -0
  91. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/__init__.py +0 -0
  92. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/__main__.py +0 -0
  93. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_api/__init__.py +0 -0
  94. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_api/main.py +0 -0
  95. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_chunker.py +0 -0
  96. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_constants.py +0 -0
  97. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_entity_extraction.py +0 -0
  98. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/__init__.py +0 -0
  99. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_email.py +0 -0
  100. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_html.py +0 -0
  101. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  102. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_presentation.py +0 -0
  103. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  104. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_extractors/_structured.py +0 -0
  105. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_gmft.py +0 -0
  106. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_language_detection.py +0 -0
  107. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mcp/__init__.py +0 -0
  108. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_mime_types.py +0 -0
  109. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/__init__.py +0 -0
  110. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_base.py +0 -0
  111. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_easyocr.py +0 -0
  112. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
  113. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  114. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_playa.py +0 -0
  115. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_registry.py +0 -0
  116. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/__init__.py +0 -0
  117. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_cache.py +0 -0
  118. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_device.py +0 -0
  119. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_document_cache.py +0 -0
  120. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_errors.py +0 -0
  121. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  122. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_process_pool.py +0 -0
  123. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_quality.py +0 -0
  124. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_serialization.py +0 -0
  125. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_string.py +0 -0
  126. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_sync.py +0 -0
  127. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_table.py +0 -0
  128. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/_utils/_tmp.py +0 -0
  129. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/cli.py +0 -0
  130. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/exceptions.py +0 -0
  131. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/extraction.py +0 -0
  132. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/kreuzberg/py.typed +0 -0
  133. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/__init__.py +0 -0
  134. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/api/__init__.py +0 -0
  135. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/api/main_test.py +0 -0
  136. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/chunker_test.py +0 -0
  137. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_command_test.py +0 -0
  138. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_integration_test.py +0 -0
  139. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/cli_test.py +0 -0
  140. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/config_test.py +0 -0
  141. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/conftest.py +0 -0
  142. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/entity_extraction_test.py +0 -0
  143. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/exceptions_test.py +0 -0
  144. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extraction_batch_test.py +0 -0
  145. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extraction_test.py +0 -0
  146. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/__init__.py +0 -0
  147. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/email_test.py +0 -0
  148. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/html_test.py +0 -0
  149. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/image_test.py +0 -0
  150. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  151. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pandoc_test.py +0 -0
  152. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/pdf_test.py +0 -0
  153. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/presentation_test.py +0 -0
  154. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/spreed_sheet_test.py +0 -0
  155. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/extractors/structured_test.py +0 -0
  156. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/gmft_extended_test.py +0 -0
  157. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/gmft_test.py +0 -0
  158. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/hooks_test.py +0 -0
  159. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/language_detection_test.py +0 -0
  160. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/mcp_server_test.py +0 -0
  161. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/mime_types_test.py +0 -0
  162. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/__init__.py +0 -0
  163. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  164. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  165. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/process_manager_test.py +0 -0
  166. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  167. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/__init__.py +0 -0
  168. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/base_test.py +0 -0
  169. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/device_integration_test.py +0 -0
  170. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/easyocr_test.py +0 -0
  171. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/init_test.py +0 -0
  172. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/paddleocr_test.py +0 -0
  173. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/ocr/tesseract_test.py +0 -0
  174. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/playa_helpers_test.py +0 -0
  175. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/playa_test.py +0 -0
  176. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/registry_test.py +0 -0
  177. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
  178. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/contract.txt +0 -0
  179. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/contract_test.txt +0 -0
  180. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/document.docx +0 -0
  181. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/email/sample-email.eml +0 -0
  182. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  183. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/excel.xlsx +0 -0
  184. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/form_test.txt +0 -0
  185. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/french-text.txt +0 -0
  186. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/german-text.txt +0 -0
  187. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/html.html +0 -0
  188. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  189. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/invoice_image.png +0 -0
  190. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/invoice_test.txt +0 -0
  191. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/json/sample-document.json +0 -0
  192. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  193. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/markdown.md +0 -0
  194. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  195. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/non-searchable.pdf +0 -0
  196. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/ocr-image.jpg +0 -0
  197. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  198. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  199. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  200. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  201. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/receipt_test.txt +0 -0
  202. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/report_test.txt +0 -0
  203. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/sample-contract.pdf +0 -0
  204. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/scanned.pdf +0 -0
  205. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/searchable.pdf +0 -0
  206. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/spanish-text.txt +0 -0
  207. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/test-article.pdf +0 -0
  208. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  209. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/types_test.py +0 -0
  210. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/__init__.py +0 -0
  211. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/cache_test.py +0 -0
  212. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/device_test.py +0 -0
  213. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/errors_test.py +0 -0
  214. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/pdf_lock_test.py +0 -0
  215. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/process_pool_test.py +0 -0
  216. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/serialization_test.py +0 -0
  217. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/string_test.py +0 -0
  218. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/sync_test.py +0 -0
  219. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/table_test.py +0 -0
  220. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils/tmp_test.py +0 -0
  221. {kreuzberg-3.10.1 → kreuzberg-3.11.0}/tests/utils_errors_test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.10.1
3
+ Version: 3.11.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.2
35
+ Requires-Dist: mcp>=1.12.3
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: playa-pdf>=0.6.4
38
38
  Requires-Dist: psutil>=7.0.0
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
45
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
46
  Provides-Extra: all
47
47
  Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: deep-translator>=1.11.4; extra == 'all'
48
49
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
50
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
51
  Requires-Dist: gmft>=0.4.2; extra == 'all'
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
55
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
57
+ Requires-Dist: pandas>=2.3.1; extra == 'all'
56
58
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
57
59
  Requires-Dist: rich>=14.1.0; extra == 'all'
58
60
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
61
63
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
62
64
  Provides-Extra: api
63
65
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
64
- Provides-Extra: auto-classify-document-type
65
- Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
66
- Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
67
66
  Provides-Extra: chunking
68
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
69
68
  Provides-Extra: cli
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
72
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
72
  Provides-Extra: crypto
74
73
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
74
+ Provides-Extra: document-classification
75
+ Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
76
+ Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
75
77
  Provides-Extra: easyocr
76
78
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
77
79
  Provides-Extra: entity-extraction
@@ -34,7 +34,7 @@ All commands run through `uv run`:
34
34
  # Testing
35
35
  uv run pytest # Run all tests
36
36
  uv run pytest tests/foo_test.py # Run specific test
37
- uv run pytest --cov # With coverage (must be ≥95%)
37
+ uv run pytest --cov # With coverage (must be ≥85%)
38
38
 
39
39
  # Code quality
40
40
  uv run ruff format # Format code
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
134
134
 
135
135
  spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
136
136
 
137
+ ### Document Classification
138
+
139
+ For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
140
+
141
+ ```shell
142
+ pip install "kreuzberg[document-classification]"
143
+ ```
144
+
145
+ This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
146
+
137
147
  ### All Optional Dependencies
138
148
 
139
149
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
145
155
  This is equivalent to:
146
156
 
147
157
  ```shell
148
- pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
158
+ pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
149
159
  ```
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
22
22
 
23
23
  ### Engineering Principles
24
24
 
25
- - **Test Coverage**: 95%+ coverage with comprehensive test suites
25
+ - **Test Coverage**: Comprehensive test suites ensuring code reliability
26
26
  - **API Design**: True async/await implementation alongside synchronous APIs
27
27
  - **Error Handling**: Consistent exception hierarchy with detailed context
28
28
  - **Type Safety**: Full type annotations for enhanced developer experience
@@ -2,9 +2,17 @@
2
2
 
3
3
  Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
4
4
 
5
+ ## Installation
6
+
7
+ Document classification requires the `document-classification` extra to be installed:
8
+
9
+ ```bash
10
+ pip install "kreuzberg[document-classification]"
11
+ ```
12
+
5
13
  ## Enabling Document Classification
6
14
 
7
- To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
15
+ Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
8
16
 
9
17
  ```python
10
18
  from kreuzberg import ExtractionConfig, extract_file
@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
97
97
  if not isinstance(backend_config, dict):
98
98
  return None
99
99
 
100
- if backend == "tesseract":
101
- # Convert psm integer to PSMMode enum if needed
102
- processed_config = backend_config.copy()
103
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
104
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
105
-
106
- processed_config["psm"] = PSMMode(processed_config["psm"])
107
- return TesseractConfig(**processed_config)
108
- if backend == "easyocr":
109
- return EasyOCRConfig(**backend_config)
110
- if backend == "paddleocr":
111
- return PaddleOCRConfig(**backend_config)
112
- return None
100
+ match backend:
101
+ case "tesseract":
102
+ # Convert psm integer to PSMMode enum if needed
103
+ processed_config = backend_config.copy()
104
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
+ from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
106
+
107
+ processed_config["psm"] = PSMMode(processed_config["psm"])
108
+ return TesseractConfig(**processed_config)
109
+ case "easyocr":
110
+ return EasyOCRConfig(**backend_config)
111
+ case "paddleocr":
112
+ return PaddleOCRConfig(**backend_config)
113
+ case _:
114
+ return None
113
115
 
114
116
 
115
117
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
140
142
  "document_classification_mode",
141
143
  "keyword_count",
142
144
  }
143
- extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
145
+ extraction_config = extraction_config | {
146
+ field: config_dict[field] for field in basic_fields if field in config_dict
147
+ }
144
148
 
145
149
  # Handle OCR backend configuration
146
150
  ocr_backend = extraction_config.get("ocr_backend")
@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
62
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
63
63
  except ImportError as e: # pragma: no cover
64
64
  raise MissingDependencyError(
65
- "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
65
+ "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
66
66
  ) from e
67
67
 
68
68
  try:
@@ -116,8 +116,7 @@ class Extractor(ABC):
116
116
  quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
117
 
118
118
  # Add quality metadata
119
- enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
- enhanced_metadata["quality_score"] = quality_score
119
+ enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
121
120
 
122
121
  # Return enhanced result
123
122
  return ExtractionResult(
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
85
85
 
86
86
  backend = get_ocr_backend(self.config.ocr_backend)
87
87
 
88
- if self.config.ocr_backend == "tesseract":
89
- config = (
90
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
91
- )
92
- result = backend.process_file_sync(path, **asdict(config))
93
- elif self.config.ocr_backend == "paddleocr":
94
- paddle_config = (
95
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
96
- )
97
- result = backend.process_file_sync(path, **asdict(paddle_config))
98
- elif self.config.ocr_backend == "easyocr":
99
- easy_config = (
100
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
101
- )
102
- result = backend.process_file_sync(path, **asdict(easy_config))
103
- else:
104
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
88
+ match self.config.ocr_backend:
89
+ case "tesseract":
90
+ config = (
91
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
92
+ )
93
+ result = backend.process_file_sync(path, **asdict(config))
94
+ case "paddleocr":
95
+ paddle_config = (
96
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
97
+ )
98
+ result = backend.process_file_sync(path, **asdict(paddle_config))
99
+ case "easyocr":
100
+ easy_config = (
101
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
102
+ )
103
+ result = backend.process_file_sync(path, **asdict(easy_config))
104
+ case _:
105
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
105
106
  return self._apply_quality_processing(result)
106
107
 
107
108
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
88
88
  # Enhance metadata with table information
89
89
  if result.tables:
90
90
  table_summary = generate_table_summary(result.tables)
91
- result.metadata.update(
92
- {
93
- "table_count": table_summary["table_count"],
94
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
95
- f"across {table_summary['pages_with_tables']} pages with "
96
- f"{table_summary['total_rows']} total rows",
97
- }
98
- )
91
+ result.metadata = result.metadata | {
92
+ "table_count": table_summary["table_count"],
93
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
94
+ f"across {table_summary['pages_with_tables']} pages with "
95
+ f"{table_summary['total_rows']} total rows",
96
+ }
99
97
 
100
98
  return self._apply_quality_processing(result)
101
99
 
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
153
151
  # Enhance metadata with table information
154
152
  if tables:
155
153
  table_summary = generate_table_summary(tables)
156
- result.metadata.update(
157
- {
158
- "table_count": table_summary["table_count"],
159
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
160
- f"across {table_summary['pages_with_tables']} pages with "
161
- f"{table_summary['total_rows']} total rows",
162
- }
163
- )
154
+ result.metadata = result.metadata | {
155
+ "table_count": table_summary["table_count"],
156
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
157
+ f"across {table_summary['pages_with_tables']} pages with "
158
+ f"{table_summary['total_rows']} total rows",
159
+ }
164
160
 
165
161
  # Apply quality processing
166
162
  return self._apply_quality_processing(result)
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
386
382
  backend = get_ocr_backend(self.config.ocr_backend)
387
383
  paths = [Path(p) for p in image_paths]
388
384
 
389
- if self.config.ocr_backend == "tesseract":
390
- config = (
391
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
392
- )
393
- results = backend.process_batch_sync(paths, **asdict(config))
394
- elif self.config.ocr_backend == "paddleocr":
395
- paddle_config = (
396
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
397
- )
398
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
399
- elif self.config.ocr_backend == "easyocr":
400
- easy_config = (
401
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
402
- )
403
- results = backend.process_batch_sync(paths, **asdict(easy_config))
404
- else:
405
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
385
+ match self.config.ocr_backend:
386
+ case "tesseract":
387
+ config = (
388
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
389
+ )
390
+ results = backend.process_batch_sync(paths, **asdict(config))
391
+ case "paddleocr":
392
+ paddle_config = (
393
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
394
+ )
395
+ results = backend.process_batch_sync(paths, **asdict(paddle_config))
396
+ case "easyocr":
397
+ easy_config = (
398
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
399
+ )
400
+ results = backend.process_batch_sync(paths, **asdict(easy_config))
401
+ case _:
402
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
406
403
 
407
404
  # Use list comprehension and join for efficient string building
408
405
  return "\n\n".join(result.content for result in results)
@@ -51,7 +51,7 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
51
51
  }
52
52
 
53
53
  # Override with provided parameters
54
- config_dict.update(kwargs)
54
+ config_dict = config_dict | kwargs
55
55
 
56
56
  return ExtractionConfig(**config_dict)
57
57
 
@@ -349,7 +349,7 @@ class ExtractionConfig:
349
349
  """Configuration for language detection. If None, uses default settings."""
350
350
  spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
351
351
  """Configuration for spaCy entity extraction. If None, uses default settings."""
352
- auto_detect_document_type: bool = True
352
+ auto_detect_document_type: bool = False
353
353
  """Whether to automatically detect the document type."""
354
354
  document_type_confidence_threshold: float = 0.5
355
355
  """Confidence threshold for document type detection."""
@@ -398,15 +398,16 @@ class ExtractionConfig:
398
398
  return asdict(self.ocr_config)
399
399
 
400
400
  # Lazy load and cache default configs instead of creating new instances
401
- if self.ocr_backend == "tesseract":
402
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
401
+ match self.ocr_backend:
402
+ case "tesseract":
403
+ from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
403
404
 
404
- return asdict(TesseractConfig())
405
- if self.ocr_backend == "easyocr":
406
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
405
+ return asdict(TesseractConfig())
406
+ case "easyocr":
407
+ from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
407
408
 
408
- return asdict(EasyOCRConfig())
409
- # paddleocr
410
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
409
+ return asdict(EasyOCRConfig())
410
+ case _: # paddleocr or any other backend
411
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
411
412
 
412
- return asdict(PaddleOCRConfig())
413
+ return asdict(PaddleOCRConfig())
@@ -158,4 +158,3 @@ nav:
158
158
  - Custom Hooks: advanced/custom-hooks.md
159
159
  - Custom Extractors: advanced/custom-extractors.md
160
160
  - Contributing: contributing.md
161
- - Changelog: changelog.md
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
5
5
 
6
6
  [project]
7
7
  name = "kreuzberg"
8
- version = "3.10.1"
8
+ version = "3.11.0"
9
9
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -61,7 +61,7 @@ dependencies = [
61
61
  "chardetng-py>=0.3.5",
62
62
  "exceptiongroup>=1.2.2; python_version<'3.11'",
63
63
  "html-to-markdown[lxml]>=1.9.0",
64
- "mcp>=1.12.2",
64
+ "mcp>=1.12.3",
65
65
  "msgspec>=0.18.0",
66
66
  "playa-pdf>=0.6.4", # pinned due to breaking changes in 0.5.0
67
67
  "psutil>=7.0.0",
@@ -76,15 +76,11 @@ optional-dependencies.additional-extensions = [
76
76
  "tomli>=2.0.0; python_version<'3.11'",
77
77
  ]
78
78
  optional-dependencies.all = [
79
- "kreuzberg[additional-extensions,api,chunking,cli,crypto,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
79
+ "kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]",
80
80
  ]
81
81
  optional-dependencies.api = [
82
82
  "litestar[standard,structlog,opentelemetry]>=2.16.0",
83
83
  ]
84
- optional-dependencies.auto-classify-document-type = [
85
- "deep-translator>=1.11.4",
86
- "pandas>=2.3.1",
87
- ]
88
84
  optional-dependencies.chunking = [ "semantic-text-splitter>=0.27.0" ]
89
85
  optional-dependencies.cli = [
90
86
  "click>=8.2.1",
@@ -92,6 +88,10 @@ optional-dependencies.cli = [
92
88
  "tomli>=2.0.0; python_version<'3.11'",
93
89
  ]
94
90
  optional-dependencies.crypto = [ "playa-pdf[crypto]>=0.6.4" ]
91
+ optional-dependencies.document-classification = [
92
+ "deep-translator>=1.11.4",
93
+ "pandas>=2.3.1",
94
+ ]
95
95
  optional-dependencies.easyocr = [ "easyocr>=1.7.2" ]
96
96
  optional-dependencies.entity-extraction = [ "keybert>=0.9.0", "spacy>=3.8.7" ]
97
97
  optional-dependencies.gmft = [ "gmft>=0.4.2" ]
@@ -256,7 +256,7 @@ exclude_lines = [
256
256
  "class .*\\bProtocol\\):",
257
257
  "@(abc\\.)?abstractmethod",
258
258
  ]
259
- fail_under = 95
259
+ fail_under = 85
260
260
 
261
261
  [tool.mypy]
262
262
  packages = [ "kreuzberg", "tests", "benchmarks.src.kreuzberg_benchmarks" ]
@@ -2,8 +2,10 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import builtins
6
+ import sys
5
7
  from pathlib import Path
6
- from typing import TYPE_CHECKING
8
+ from typing import TYPE_CHECKING, Any
7
9
 
8
10
  import pandas as pd
9
11
  import pytest
@@ -15,6 +17,7 @@ from kreuzberg._document_classification import (
15
17
  classify_document_from_layout,
16
18
  )
17
19
  from kreuzberg._types import ExtractionConfig, ExtractionResult
20
+ from kreuzberg.exceptions import MissingDependencyError
18
21
 
19
22
  if TYPE_CHECKING:
20
23
  from pytest_mock import MockerFixture
@@ -112,7 +115,7 @@ def test_classify_document_with_metadata() -> None:
112
115
  mime_type="text/plain",
113
116
  metadata={"title": "Invoice #12345", "subject": "Payment Due"},
114
117
  )
115
- config = ExtractionConfig()
118
+ config = ExtractionConfig(auto_detect_document_type=True)
116
119
 
117
120
  doc_type, confidence = classify_document(result, config)
118
121
 
@@ -142,7 +145,7 @@ def test_classify_document_empty_content() -> None:
142
145
  mime_type="text/plain",
143
146
  metadata={},
144
147
  )
145
- config = ExtractionConfig()
148
+ config = ExtractionConfig(auto_detect_document_type=True)
146
149
 
147
150
  doc_type, confidence = classify_document(result, config)
148
151
 
@@ -158,7 +161,7 @@ def test_classify_document_with_exclusions() -> None:
158
161
  mime_type="text/plain",
159
162
  metadata={},
160
163
  )
161
- config = ExtractionConfig()
164
+ config = ExtractionConfig(auto_detect_document_type=True)
162
165
 
163
166
  doc_type, confidence = classify_document(result, config)
164
167
 
@@ -184,7 +187,7 @@ def test_classify_document_from_layout_basic() -> None:
184
187
  metadata={},
185
188
  layout=layout_df,
186
189
  )
187
- config = ExtractionConfig()
190
+ config = ExtractionConfig(auto_detect_document_type=True)
188
191
 
189
192
  doc_type, confidence = classify_document_from_layout(result, config)
190
193
 
@@ -200,7 +203,7 @@ def test_classify_document_from_layout_no_layout() -> None:
200
203
  mime_type="text/plain",
201
204
  metadata={},
202
205
  )
203
- config = ExtractionConfig()
206
+ config = ExtractionConfig(auto_detect_document_type=True)
204
207
 
205
208
  doc_type, confidence = classify_document_from_layout(result, config)
206
209
 
@@ -218,7 +221,7 @@ def test_classify_document_from_layout_empty_layout() -> None:
218
221
  metadata={},
219
222
  layout=layout_df,
220
223
  )
221
- config = ExtractionConfig()
224
+ config = ExtractionConfig(auto_detect_document_type=True)
222
225
 
223
226
  doc_type, confidence = classify_document_from_layout(result, config)
224
227
 
@@ -236,7 +239,7 @@ def test_classify_document_from_layout_missing_columns() -> None:
236
239
  metadata={},
237
240
  layout=layout_df,
238
241
  )
239
- config = ExtractionConfig()
242
+ config = ExtractionConfig(auto_detect_document_type=True)
240
243
 
241
244
  doc_type, confidence = classify_document_from_layout(result, config)
242
245
 
@@ -260,7 +263,7 @@ def test_classify_document_from_layout_no_pattern_matches() -> None:
260
263
  metadata={},
261
264
  layout=layout_df,
262
265
  )
263
- config = ExtractionConfig()
266
+ config = ExtractionConfig(auto_detect_document_type=True)
264
267
 
265
268
  doc_type, confidence = classify_document_from_layout(result, config)
266
269
 
@@ -285,7 +288,7 @@ def test_classify_document_from_layout_header_patterns() -> None:
285
288
  metadata={},
286
289
  layout=layout_df,
287
290
  )
288
- config = ExtractionConfig()
291
+ config = ExtractionConfig(auto_detect_document_type=True)
289
292
 
290
293
  doc_type, confidence = classify_document_from_layout(result, config)
291
294
 
@@ -312,7 +315,7 @@ def test_classify_document_from_layout_position_scoring() -> None:
312
315
  metadata={},
313
316
  layout=layout_df,
314
317
  )
315
- config = ExtractionConfig()
318
+ config = ExtractionConfig(auto_detect_document_type=True)
316
319
 
317
320
  doc_type, confidence = classify_document_from_layout(result, config)
318
321
 
@@ -327,7 +330,7 @@ def test_auto_detect_document_type_from_content() -> None:
327
330
  mime_type="text/plain",
328
331
  metadata={},
329
332
  )
330
- config = ExtractionConfig()
333
+ config = ExtractionConfig(auto_detect_document_type=True)
331
334
 
332
335
  detection_result = auto_detect_document_type(result, config)
333
336
 
@@ -352,7 +355,7 @@ def test_auto_detect_document_type_from_layout() -> None:
352
355
  metadata={},
353
356
  layout=layout_df,
354
357
  )
355
- config = ExtractionConfig()
358
+ config = ExtractionConfig(auto_detect_document_type=True)
356
359
 
357
360
  detection_result = auto_detect_document_type(result, config)
358
361
 
@@ -382,7 +385,7 @@ def test_auto_detect_document_type_no_matches() -> None:
382
385
  mime_type="text/plain",
383
386
  metadata={},
384
387
  )
385
- config = ExtractionConfig()
388
+ config = ExtractionConfig(auto_detect_document_type=True)
386
389
 
387
390
  detection_result = auto_detect_document_type(result, config)
388
391
 
@@ -884,3 +887,35 @@ def test_classify_document_confidence_calculation(mocker: MockerFixture) -> None
884
887
 
885
888
  assert doc_type == "invoice"
886
889
  assert confidence == 1.0 # All 3 matches are for invoice, so 3/3 = 1.0
890
+
891
+
892
+ def test_missing_deep_translator_import_error(mocker: MockerFixture) -> None:
893
+ """Test that MissingDependencyError is raised when deep-translator is not installed."""
894
+ # Temporarily remove deep_translator from sys.modules if it exists
895
+ original_module = sys.modules.pop("deep_translator", None)
896
+
897
+ try:
898
+ # Mock the import to raise ImportError when importing deep_translator
899
+ def mock_import(name: str, *args: Any, **kwargs: Any) -> Any:
900
+ if name == "deep_translator":
901
+ raise ImportError("No module named 'deep_translator'")
902
+ return original_import(name, *args, **kwargs)
903
+
904
+ original_import = builtins.__import__
905
+ mocker.patch("builtins.__import__", side_effect=mock_import)
906
+
907
+ # Import _get_translated_text after setting up the mock
908
+ from kreuzberg._document_classification import _get_translated_text
909
+
910
+ result = ExtractionResult(content="Test content", mime_type="text/plain", metadata={})
911
+
912
+ # Should raise MissingDependencyError when trying to import deep_translator
913
+ with pytest.raises(MissingDependencyError) as exc_info:
914
+ _get_translated_text(result)
915
+
916
+ assert "deep-translator" in str(exc_info.value)
917
+ assert "pip install 'kreuzberg[document-classification]'" in str(exc_info.value)
918
+ finally:
919
+ # Restore original module if it existed
920
+ if original_module is not None:
921
+ sys.modules["deep_translator"] = original_module