kreuzberg 3.10.0__tar.gz → 3.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/ci.yaml +97 -46
  2. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.pre-commit-config.yaml +1 -1
  3. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/PKG-INFO +7 -5
  4. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/benchmark_baseline.py +1 -1
  5. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/end_to_end_benchmark.py +1 -1
  6. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
  7. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/statistical_benchmark.py +1 -1
  8. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/contributing.md +1 -1
  9. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/installation.md +11 -1
  10. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/index.md +1 -1
  11. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/document-classification.md +9 -1
  12. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_api/main.py +1 -1
  13. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_chunker.py +1 -1
  14. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_config.py +41 -16
  15. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_document_classification.py +41 -6
  16. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_entity_extraction.py +2 -2
  17. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_base.py +1 -2
  18. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_email.py +31 -8
  19. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_image.py +18 -17
  20. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pdf.py +31 -34
  21. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_structured.py +3 -3
  22. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_gmft.py +2 -2
  23. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_language_detection.py +1 -1
  24. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mcp/server.py +2 -2
  25. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_base.py +3 -3
  26. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_easyocr.py +3 -3
  27. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_paddleocr.py +2 -2
  28. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_playa.py +3 -1
  29. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_types.py +14 -13
  30. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_device.py +6 -6
  31. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_document_cache.py +1 -0
  32. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/cli.py +6 -6
  33. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/mkdocs.yaml +0 -1
  34. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/pyproject.toml +24 -9
  35. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/api/main_test.py +323 -0
  36. kreuzberg-3.11.0/tests/cli_command_test.py +523 -0
  37. kreuzberg-3.11.0/tests/config_test.py +1570 -0
  38. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/conftest.py +6 -0
  39. kreuzberg-3.11.0/tests/document_classification_test.py +921 -0
  40. kreuzberg-3.11.0/tests/entity_extraction_test.py +675 -0
  41. kreuzberg-3.11.0/tests/extraction_test.py +834 -0
  42. kreuzberg-3.11.0/tests/extractors/email_test.py +1003 -0
  43. kreuzberg-3.11.0/tests/extractors/image_test.py +768 -0
  44. kreuzberg-3.11.0/tests/extractors/pandoc_test.py +2123 -0
  45. kreuzberg-3.11.0/tests/extractors/pdf_test.py +973 -0
  46. kreuzberg-3.11.0/tests/extractors/presentation_test.py +1005 -0
  47. kreuzberg-3.11.0/tests/extractors/spreed_sheet_test.py +1237 -0
  48. kreuzberg-3.11.0/tests/extractors/structured_test.py +302 -0
  49. kreuzberg-3.11.0/tests/gmft_test.py +720 -0
  50. kreuzberg-3.11.0/tests/language_detection_test.py +172 -0
  51. kreuzberg-3.11.0/tests/mcp_server_test.py +883 -0
  52. kreuzberg-3.11.0/tests/ocr/tesseract_test.py +1141 -0
  53. kreuzberg-3.11.0/tests/playa_helpers_test.py +549 -0
  54. kreuzberg-3.11.0/tests/types_test.py +440 -0
  55. kreuzberg-3.11.0/tests/utils/string_test.py +305 -0
  56. kreuzberg-3.11.0/tests/utils_errors_test.py +299 -0
  57. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/uv.lock +238 -184
  58. kreuzberg-3.10.0/docs/changelog.md +0 -49
  59. kreuzberg-3.10.0/tests/config_test.py +0 -401
  60. kreuzberg-3.10.0/tests/document_classification_test.py +0 -86
  61. kreuzberg-3.10.0/tests/entity_extraction_test.py +0 -102
  62. kreuzberg-3.10.0/tests/extraction_test.py +0 -389
  63. kreuzberg-3.10.0/tests/extractors/email_comprehensive_test.py +0 -326
  64. kreuzberg-3.10.0/tests/extractors/email_test.py +0 -31
  65. kreuzberg-3.10.0/tests/extractors/image_test.py +0 -275
  66. kreuzberg-3.10.0/tests/extractors/pandoc_test.py +0 -458
  67. kreuzberg-3.10.0/tests/extractors/pdf_test.py +0 -438
  68. kreuzberg-3.10.0/tests/extractors/presentation_test.py +0 -410
  69. kreuzberg-3.10.0/tests/extractors/spreed_sheet_test.py +0 -325
  70. kreuzberg-3.10.0/tests/extractors/structured_test.py +0 -90
  71. kreuzberg-3.10.0/tests/gmft_test.py +0 -397
  72. kreuzberg-3.10.0/tests/language_detection_test.py +0 -237
  73. kreuzberg-3.10.0/tests/mcp_server_test.py +0 -382
  74. kreuzberg-3.10.0/tests/ocr/tesseract_test.py +0 -477
  75. kreuzberg-3.10.0/tests/types_test.py +0 -191
  76. kreuzberg-3.10.0/tests/utils/string_test.py +0 -85
  77. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.commitlintrc +0 -0
  78. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.deepsource.toml +0 -0
  79. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.docker/Dockerfile +0 -0
  80. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.docker/README.md +0 -0
  81. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.dockerignore +0 -0
  82. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/dependabot.yaml +0 -0
  83. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/docs.yml +0 -0
  84. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/pr-title.yaml +0 -0
  85. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/publish-docker.yml +0 -0
  86. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/release.yaml +0 -0
  87. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.gitignore +0 -0
  88. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/.markdownlint.yaml +0 -0
  89. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/LICENSE +0 -0
  90. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/README.md +0 -0
  91. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/ai-rulez.yaml +0 -0
  92. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/README.md +0 -0
  93. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/final_benchmark.py +0 -0
  94. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/pyproject.toml +0 -0
  95. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/baseline_results.json +0 -0
  96. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  97. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
  98. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/final_benchmark_results.json +0 -0
  99. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/latest.json +0 -0
  100. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/mime_caching_results.json +0 -0
  101. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/msgspec_caching_results.json +0 -0
  102. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/ocr_caching_results.json +0 -0
  103. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
  104. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
  105. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/results/table_caching_results.json +0 -0
  106. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/serialization_benchmark.py +0 -0
  107. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  108. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  109. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  110. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  111. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  112. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  113. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/custom-extractors.md +0 -0
  114. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/custom-hooks.md +0 -0
  115. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/error-handling.md +0 -0
  116. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/index.md +0 -0
  117. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/advanced/performance.md +0 -0
  118. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/exceptions.md +0 -0
  119. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/extraction-functions.md +0 -0
  120. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/extractor-registry.md +0 -0
  121. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/index.md +0 -0
  122. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/ocr-configuration.md +0 -0
  123. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/api-reference/types.md +0 -0
  124. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/assets/favicon.png +0 -0
  125. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/assets/logo.png +0 -0
  126. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/cli.md +0 -0
  127. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/css/extra.css +0 -0
  128. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/examples/extraction-examples.md +0 -0
  129. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/examples/index.md +0 -0
  130. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/index.md +0 -0
  131. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/quick-start.md +0 -0
  132. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/performance-analysis.md +0 -0
  133. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/api-server.md +0 -0
  134. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/basic-usage.md +0 -0
  135. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/chunking.md +0 -0
  136. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/docker.md +0 -0
  137. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/extraction-configuration.md +0 -0
  138. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/index.md +0 -0
  139. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/mcp-server.md +0 -0
  140. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/metadata-extraction.md +0 -0
  141. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/ocr-backends.md +0 -0
  142. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/ocr-configuration.md +0 -0
  143. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/supported-formats.md +0 -0
  144. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/__init__.py +0 -0
  145. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/__main__.py +0 -0
  146. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_api/__init__.py +0 -0
  147. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_constants.py +0 -0
  148. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/__init__.py +0 -0
  149. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_html.py +0 -0
  150. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_pandoc.py +0 -0
  151. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_presentation.py +0 -0
  152. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  153. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mcp/__init__.py +0 -0
  154. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_mime_types.py +0 -0
  155. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/__init__.py +0 -0
  156. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_ocr/_tesseract.py +0 -0
  157. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_registry.py +0 -0
  158. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/__init__.py +0 -0
  159. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_cache.py +0 -0
  160. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_errors.py +0 -0
  161. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
  162. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_process_pool.py +0 -0
  163. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_quality.py +0 -0
  164. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_serialization.py +0 -0
  165. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_string.py +0 -0
  166. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_sync.py +0 -0
  167. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_table.py +0 -0
  168. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_utils/_tmp.py +0 -0
  169. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/exceptions.py +0 -0
  170. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/extraction.py +0 -0
  171. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/py.typed +0 -0
  172. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/__init__.py +0 -0
  173. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/api/__init__.py +0 -0
  174. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/chunker_test.py +0 -0
  175. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/cli_integration_test.py +0 -0
  176. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/cli_test.py +0 -0
  177. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/exceptions_test.py +0 -0
  178. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extraction_batch_test.py +0 -0
  179. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/__init__.py +0 -0
  180. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/html_test.py +0 -0
  181. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/extractors/pandoc_metadata_test.py +0 -0
  182. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/gmft_extended_test.py +0 -0
  183. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/hooks_test.py +0 -0
  184. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/mime_types_test.py +0 -0
  185. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/__init__.py +0 -0
  186. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
  187. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  188. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/process_manager_test.py +0 -0
  189. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  190. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/__init__.py +0 -0
  191. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/base_test.py +0 -0
  192. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/device_integration_test.py +0 -0
  193. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/easyocr_test.py +0 -0
  194. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/init_test.py +0 -0
  195. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/ocr/paddleocr_test.py +0 -0
  196. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/playa_test.py +0 -0
  197. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/registry_test.py +0 -0
  198. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/better-ocr-image.jpg +0 -0
  199. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/contract.txt +0 -0
  200. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/contract_test.txt +0 -0
  201. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/document.docx +0 -0
  202. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/email/sample-email.eml +0 -0
  203. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  204. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/excel.xlsx +0 -0
  205. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/form_test.txt +0 -0
  206. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/french-text.txt +0 -0
  207. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/german-text.txt +0 -0
  208. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/html.html +0 -0
  209. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/images/test_hello_world.png +0 -0
  210. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/invoice_image.png +0 -0
  211. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/invoice_test.txt +0 -0
  212. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/json/sample-document.json +0 -0
  213. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  214. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/markdown.md +0 -0
  215. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
  216. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/non-searchable.pdf +0 -0
  217. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/ocr-image.jpg +0 -0
  218. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  219. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  220. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  221. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  222. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/receipt_test.txt +0 -0
  223. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/report_test.txt +0 -0
  224. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/sample-contract.pdf +0 -0
  225. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/scanned.pdf +0 -0
  226. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/searchable.pdf +0 -0
  227. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/spanish-text.txt +0 -0
  228. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/test-article.pdf +0 -0
  229. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  230. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/__init__.py +0 -0
  231. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/cache_test.py +0 -0
  232. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/device_test.py +0 -0
  233. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/errors_test.py +0 -0
  234. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/pdf_lock_test.py +0 -0
  235. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/process_pool_test.py +0 -0
  236. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/serialization_test.py +0 -0
  237. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/sync_test.py +0 -0
  238. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/table_test.py +0 -0
  239. {kreuzberg-3.10.0 → kreuzberg-3.11.0}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
51
51
  - name: Execute Pre-Commit
52
52
  run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
53
53
 
54
+ # Coverage job runs first, only on Python 3.13 Ubuntu
55
+ coverage:
56
+ needs: validate
57
+ runs-on: ubuntu-latest
58
+ timeout-minutes: 20
59
+ steps:
60
+ - name: Checkout
61
+ uses: actions/checkout@v4
62
+
63
+ - name: Install uv
64
+ uses: astral-sh/setup-uv@v6
65
+ with:
66
+ enable-cache: true
67
+
68
+ - name: Install Python
69
+ uses: actions/setup-python@v5
70
+ id: setup-python
71
+ with:
72
+ python-version: "3.13"
73
+
74
+ - name: Cache Python Dependencies
75
+ id: python-cache
76
+ uses: actions/cache@v4
77
+ with:
78
+ path: |
79
+ ~/.cache/uv
80
+ .venv
81
+ key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
82
+ restore-keys: |
83
+ python-dependencies-ubuntu-latest-3.13-
84
+
85
+ - name: Install Dependencies
86
+ uses: nick-fields/retry@v3
87
+ with:
88
+ timeout_minutes: 5
89
+ max_attempts: 3
90
+ retry_wait_seconds: 30
91
+ command: |
92
+ uv sync --all-packages --all-extras --dev
93
+ shell: bash
94
+
95
+ - name: Install System Dependencies
96
+ uses: nick-fields/retry@v3
97
+ with:
98
+ timeout_minutes: 5
99
+ max_attempts: 3
100
+ retry_wait_seconds: 30
101
+ command: |
102
+ sudo apt-get update
103
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
104
+ shell: bash
105
+
106
+ - name: Run Tests with Coverage
107
+ uses: nick-fields/retry@v3
108
+ with:
109
+ timeout_minutes: 15
110
+ max_attempts: 3
111
+ retry_wait_seconds: 10
112
+ command: |
113
+ uv run coverage erase
114
+ uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
115
+ uv run coverage report --precision=2
116
+ shell: bash
117
+
118
+ - name: Upload Coverage to DeepSource
119
+ if: always() && github.event_name == 'push'
120
+ env:
121
+ DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
122
+ run: |
123
+ # Install DeepSource CLI
124
+ curl -fsSL https://deepsource.io/cli | sh
125
+ # Upload coverage report
126
+ ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
127
+
128
+ - name: Upload Coverage Artifacts
129
+ if: always()
130
+ uses: actions/upload-artifact@v4
131
+ with:
132
+ name: coverage-report-${{ github.sha }}
133
+ path: |
134
+ coverage.lcov
135
+ .coverage
136
+ retention-days: 7
137
+
138
+ # Full test matrix runs only after coverage succeeds
54
139
  test:
140
+ needs: coverage
141
+ runs-on: ${{ matrix.os }}
55
142
  strategy:
143
+ fail-fast: false
56
144
  matrix:
57
- os: [ ubuntu-latest, macOS-latest, windows-latest ]
58
- python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
59
- runs-on: ${{ matrix.os }}
145
+ os: [ubuntu-latest, windows-latest, macos-latest]
146
+ python: ["3.10", "3.11", "3.12", "3.13"]
147
+ exclude:
148
+ # Skip Python 3.13 on macOS for now due to compatibility issues
149
+ - os: macos-latest
150
+ python: "3.13"
60
151
  timeout-minutes: 30
61
152
  steps:
62
153
  - name: Checkout
@@ -146,52 +237,12 @@ jobs:
146
237
  pandoc --version
147
238
  shell: pwsh
148
239
 
149
- - name: Clean Coverage Data
150
- run: |
151
- rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
152
- shell: bash
153
-
154
- - name: Run Tests with Coverage
155
- run: |
156
- uv run coverage erase
157
- uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
158
-
159
- - name: Upload Coverage Artifacts
160
- if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
161
- uses: actions/upload-artifact@v4
162
- with:
163
- name: coverage-report
164
- path: coverage.lcov
165
- retention-days: 1
166
-
167
- upload-coverage:
168
- needs: test
169
- runs-on: ubuntu-latest
170
- if: github.event_name == 'push' || github.event_name == 'pull_request'
171
- steps:
172
- - name: Checkout
173
- uses: actions/checkout@v4
174
- with:
175
- ref: ${{ github.event.pull_request.head.sha || github.sha }}
176
-
177
- - name: Download Coverage Artifacts
178
- uses: actions/download-artifact@v4
179
- with:
180
- name: coverage-report
181
- path: .
182
-
183
- - name: Install DeepSource CLI
240
+ - name: Run Tests (without coverage)
184
241
  uses: nick-fields/retry@v3
185
242
  with:
186
- timeout_minutes: 3
243
+ timeout_minutes: 15
187
244
  max_attempts: 3
188
245
  retry_wait_seconds: 10
189
246
  command: |
190
- curl -fsSL https://deepsource.io/cli | sh
247
+ uv run pytest -s -vvv --reruns 2 --reruns-delay 1
191
248
  shell: bash
192
-
193
- - name: Upload Coverage to DeepSource
194
- env:
195
- DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
196
- run: |
197
- ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
@@ -53,7 +53,7 @@ repos:
53
53
  hooks:
54
54
  - id: pyproject-fmt
55
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.5
56
+ rev: v0.12.7
57
57
  hooks:
58
58
  - id: ruff
59
59
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.10.0
3
+ Version: 3.11.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.2
35
+ Requires-Dist: mcp>=1.12.3
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: playa-pdf>=0.6.4
38
38
  Requires-Dist: psutil>=7.0.0
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
45
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
46
  Provides-Extra: all
47
47
  Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: deep-translator>=1.11.4; extra == 'all'
48
49
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
50
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
51
  Requires-Dist: gmft>=0.4.2; extra == 'all'
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
55
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
57
+ Requires-Dist: pandas>=2.3.1; extra == 'all'
56
58
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
57
59
  Requires-Dist: rich>=14.1.0; extra == 'all'
58
60
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
61
63
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
62
64
  Provides-Extra: api
63
65
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
64
- Provides-Extra: auto-classify-document-type
65
- Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
66
- Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
67
66
  Provides-Extra: chunking
68
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
69
68
  Provides-Extra: cli
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
72
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
72
  Provides-Extra: crypto
74
73
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
74
+ Provides-Extra: document-classification
75
+ Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
76
+ Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
75
77
  Provides-Extra: easyocr
76
78
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
77
79
  Provides-Extra: entity-extraction
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
108
108
  return results # type: ignore[return-value]
109
109
 
110
110
 
111
- if __name__ == "__main__":
111
+ if __name__ == "__main__": # pragma: no cover
112
112
  baseline_results = asyncio.run(run_baseline_benchmark())
113
113
 
114
114
  baseline_file = Path("baseline_results.json")
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
195
195
  }
196
196
 
197
197
 
198
- if __name__ == "__main__":
198
+ if __name__ == "__main__": # pragma: no cover
199
199
  print("🧪 REPRODUCIBLE CACHE BENCHMARK")
200
200
  print("Testing msgpack implementation with statistical rigor...")
201
201
  print()
@@ -1,4 +1,5 @@
1
1
  """Core benchmark implementations comparing sync vs async performance."""
2
+ # mypy: disable-error-code=unused-ignore
2
3
 
3
4
  from __future__ import annotations
4
5
 
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
187
187
  }
188
188
 
189
189
 
190
- if __name__ == "__main__":
190
+ if __name__ == "__main__": # pragma: no cover
191
191
  print("🧪 STATISTICAL CACHE BENCHMARK")
192
192
  print("Testing msgpack implementation with proper error analysis...")
193
193
  print()
@@ -34,7 +34,7 @@ All commands run through `uv run`:
34
34
  # Testing
35
35
  uv run pytest # Run all tests
36
36
  uv run pytest tests/foo_test.py # Run specific test
37
- uv run pytest --cov # With coverage (must be ≥95%)
37
+ uv run pytest --cov # With coverage (must be ≥85%)
38
38
 
39
39
  # Code quality
40
40
  uv run ruff format # Format code
@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm # Spanish
134
134
 
135
135
  spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
136
136
 
137
+ ### Document Classification
138
+
139
+ For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
140
+
141
+ ```shell
142
+ pip install "kreuzberg[document-classification]"
143
+ ```
144
+
145
+ This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
146
+
137
147
  ### All Optional Dependencies
138
148
 
139
149
  To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
145
155
  This is equivalent to:
146
156
 
147
157
  ```shell
148
- pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
158
+ pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
149
159
  ```
@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
22
22
 
23
23
  ### Engineering Principles
24
24
 
25
- - **Test Coverage**: 95%+ coverage with comprehensive test suites
25
+ - **Test Coverage**: Comprehensive test suites ensuring code reliability
26
26
  - **API Design**: True async/await implementation alongside synchronous APIs
27
27
  - **Error Handling**: Consistent exception hierarchy with detailed context
28
28
  - **Type Safety**: Full type annotations for enhanced developer experience
@@ -2,9 +2,17 @@
2
2
 
3
3
  Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
4
4
 
5
+ ## Installation
6
+
7
+ Document classification requires the `document-classification` extra to be installed:
8
+
9
+ ```bash
10
+ pip install "kreuzberg[document-classification]"
11
+ ```
12
+
5
13
  ## Enabling Document Classification
6
14
 
7
- To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
15
+ Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
8
16
 
9
17
  ```python
10
18
  from kreuzberg import ExtractionConfig, extract_file
@@ -30,7 +30,7 @@ try:
30
30
  HTTP_422_UNPROCESSABLE_ENTITY,
31
31
  HTTP_500_INTERNAL_SERVER_ERROR,
32
32
  )
33
- except ImportError as e:
33
+ except ImportError as e: # pragma: no cover
34
34
  raise MissingDependencyError.create_for_package(
35
35
  dependency_group="litestar",
36
36
  functionality="Litestar API and docker container",
@@ -43,7 +43,7 @@ def get_chunker(
43
43
  from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
- except ImportError as e:
46
+ except ImportError as e: # pragma: no cover
47
47
  raise MissingDependencyError.create_for_package(
48
48
  dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
49
49
  ) from e
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
13
13
 
14
14
  if sys.version_info >= (3, 11):
15
15
  import tomllib
16
- else:
16
+ else: # pragma: no cover
17
17
  import tomli as tomllib # type: ignore[import-not-found]
18
18
 
19
19
  from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
50
50
  # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
51
  if config_path.name == "kreuzberg.toml":
52
52
  return data # type: ignore[no-any-return]
53
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
53
+
54
+ # For other files, check if they have [tool.kreuzberg] section
55
+ if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
56
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
+
58
+ # Otherwise assume root-level configuration
59
+ return data # type: ignore[no-any-return]
54
60
 
55
61
 
56
62
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -91,19 +97,21 @@ def parse_ocr_backend_config(
91
97
  if not isinstance(backend_config, dict):
92
98
  return None
93
99
 
94
- if backend == "tesseract":
95
- # Convert psm integer to PSMMode enum if needed
96
- processed_config = backend_config.copy()
97
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
98
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
99
-
100
- processed_config["psm"] = PSMMode(processed_config["psm"])
101
- return TesseractConfig(**processed_config)
102
- if backend == "easyocr":
103
- return EasyOCRConfig(**backend_config)
104
- if backend == "paddleocr":
105
- return PaddleOCRConfig(**backend_config)
106
- return None
100
+ match backend:
101
+ case "tesseract":
102
+ # Convert psm integer to PSMMode enum if needed
103
+ processed_config = backend_config.copy()
104
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
+ from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
106
+
107
+ processed_config["psm"] = PSMMode(processed_config["psm"])
108
+ return TesseractConfig(**processed_config)
109
+ case "easyocr":
110
+ return EasyOCRConfig(**backend_config)
111
+ case "paddleocr":
112
+ return PaddleOCRConfig(**backend_config)
113
+ case _:
114
+ return None
107
115
 
108
116
 
109
117
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -129,12 +137,25 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
129
137
  "extract_keywords",
130
138
  "auto_detect_language",
131
139
  "enable_quality_processing",
140
+ "auto_detect_document_type",
141
+ "document_type_confidence_threshold",
142
+ "document_classification_mode",
143
+ "keyword_count",
144
+ }
145
+ extraction_config = extraction_config | {
146
+ field: config_dict[field] for field in basic_fields if field in config_dict
132
147
  }
133
- extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
134
148
 
135
149
  # Handle OCR backend configuration
136
150
  ocr_backend = extraction_config.get("ocr_backend")
137
151
  if ocr_backend and ocr_backend != "none":
152
+ # Validate OCR backend
153
+ valid_backends = {"tesseract", "easyocr", "paddleocr"}
154
+ if ocr_backend not in valid_backends:
155
+ raise ValidationError(
156
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
157
+ context={"provided": ocr_backend, "valid": sorted(valid_backends)},
158
+ )
138
159
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
139
160
  if ocr_config:
140
161
  extraction_config["ocr_config"] = ocr_config
@@ -286,6 +307,10 @@ _CONFIG_FIELDS = [
286
307
  "extract_keywords",
287
308
  "auto_detect_language",
288
309
  "enable_quality_processing",
310
+ "auto_detect_document_type",
311
+ "document_type_confidence_threshold",
312
+ "document_classification_mode",
313
+ "keyword_count",
289
314
  ]
290
315
 
291
316
 
@@ -4,13 +4,12 @@ import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
6
  from kreuzberg._ocr import get_ocr_backend
7
+ from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
7
8
  from kreuzberg.exceptions import MissingDependencyError
8
9
 
9
10
  if TYPE_CHECKING:
10
11
  from pathlib import Path
11
12
 
12
- from kreuzberg._types import ExtractionConfig, ExtractionResult
13
-
14
13
 
15
14
  DOCUMENT_CLASSIFIERS = {
16
15
  "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
52
51
  Raises:
53
52
  MissingDependencyError: If the deep-translator package is not installed
54
53
  """
54
+ # Combine content with metadata for classification
55
+ text_to_classify = result.content
56
+ if result.metadata:
57
+ # Add metadata values to the text for classification
58
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
+ text_to_classify = f"{text_to_classify} {metadata_text}"
60
+
55
61
  try:
56
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
57
- except ImportError as e:
63
+ except ImportError as e: # pragma: no cover
58
64
  raise MissingDependencyError(
59
- "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
65
+ "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
60
66
  ) from e
61
67
 
62
- return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
68
+ try:
69
+ return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
+ except Exception: # noqa: BLE001
71
+ # Fall back to original content in lowercase if translation fails
72
+ return text_to_classify.lower()
63
73
 
64
74
 
65
75
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
73
83
  A tuple containing the detected document type and the confidence score,
74
84
  or (None, None) if no type is detected with sufficient confidence.
75
85
  """
86
+ if not config.auto_detect_document_type:
87
+ return None, None
88
+
76
89
  translated_text = _get_translated_text(result)
77
90
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
78
91
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
108
121
  A tuple containing the detected document type and the confidence score,
109
122
  or (None, None) if no type is detected with sufficient confidence.
110
123
  """
111
- translated_text = _get_translated_text(result)
124
+ if not config.auto_detect_document_type:
125
+ return None, None
112
126
 
113
127
  if result.layout is None or result.layout.empty:
114
128
  return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
117
131
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
118
132
  return None, None
119
133
 
134
+ # Use layout text for classification, not the content
135
+ layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
+
137
+ # Translate layout text directly for classification
138
+ text_to_classify = layout_text
139
+ if result.metadata:
140
+ # Add metadata values to the text for classification
141
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
+ text_to_classify = f"{text_to_classify} {metadata_text}"
143
+
144
+ try:
145
+ from deep_translator import GoogleTranslator # noqa: PLC0415
146
+
147
+ translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
+ except Exception: # noqa: BLE001
149
+ # Fall back to original content in lowercase if translation fails
150
+ translated_text = text_to_classify.lower()
151
+
120
152
  layout_df["translated_text"] = translated_text
121
153
 
122
154
  page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
151
183
  if config.document_classification_mode == "vision" and file_path:
152
184
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
153
185
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
+ elif result.layout is not None and not result.layout.empty:
187
+ # Use layout-based classification if layout data is available
188
+ result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
154
189
  else:
155
190
  result.document_type, result.document_type_confidence = classify_document(result, config)
156
191
  return result
@@ -139,7 +139,7 @@ def extract_entities(
139
139
 
140
140
  try:
141
141
  import spacy # noqa: F401, PLC0415
142
- except ImportError as e:
142
+ except ImportError as e: # pragma: no cover
143
143
  raise MissingDependencyError.create_for_package(
144
144
  package_name="spacy",
145
145
  dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
230
230
  return [(kw, float(score)) for kw, score in keywords]
231
231
  except (RuntimeError, OSError, ValueError):
232
232
  return []
233
- except ImportError as e:
233
+ except ImportError as e: # pragma: no cover
234
234
  raise MissingDependencyError.create_for_package(
235
235
  package_name="keybert",
236
236
  dependency_group="entity-extraction",
@@ -116,8 +116,7 @@ class Extractor(ABC):
116
116
  quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
117
 
118
118
  # Add quality metadata
119
- enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
- enhanced_metadata["quality_score"] = quality_score
119
+ enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
121
120
 
122
121
  # Return enhanced result
123
122
  return ExtractionResult(
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
  # Import optional dependencies at module level with proper error handling
20
20
  try:
21
21
  import mailparse
22
- except ImportError:
22
+ except ImportError: # pragma: no cover
23
23
  mailparse = None
24
24
 
25
25
  try:
26
26
  import html2text # type: ignore[import-not-found]
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  html2text = None
29
29
 
30
30
  # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
59
59
 
60
60
  to_info = parsed_email.get("to")
61
61
  if to_info:
62
+ # Store the raw value in metadata (could be string, dict, or list)
62
63
  if isinstance(to_info, list) and to_info:
64
+ # For metadata, use first recipient's email if it's a list
63
65
  to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
66
+ metadata["email_to"] = to_email
64
67
  elif isinstance(to_info, dict):
65
- to_email = to_info.get("email", "")
68
+ metadata["email_to"] = to_info.get("email", "")
66
69
  else:
67
- to_email = str(to_info)
68
- metadata["email_to"] = to_email
69
- text_parts.append(f"To: {to_email}")
70
+ metadata["email_to"] = str(to_info)
71
+
72
+ # For display, format all recipients
73
+ to_formatted = self._format_email_field(to_info)
74
+ text_parts.append(f"To: {to_formatted}")
70
75
 
71
76
  date = parsed_email.get("date")
72
77
  if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
76
81
  cc = parsed_email.get("cc")
77
82
  if cc:
78
83
  metadata["email_cc"] = cc
79
- text_parts.append(f"CC: {cc}")
84
+ cc_formatted = self._format_email_field(cc)
85
+ text_parts.append(f"CC: {cc_formatted}")
80
86
 
81
87
  bcc = parsed_email.get("bcc")
82
88
  if bcc:
83
89
  metadata["email_bcc"] = bcc
84
- text_parts.append(f"BCC: {bcc}")
90
+ bcc_formatted = self._format_email_field(bcc)
91
+ text_parts.append(f"BCC: {bcc_formatted}")
92
+
93
+ def _format_email_field(self, field: Any) -> str:
94
+ """Format email field (to, cc, bcc) for display."""
95
+ if isinstance(field, list):
96
+ emails = []
97
+ for item in field:
98
+ if isinstance(item, dict):
99
+ email = item.get("email", "")
100
+ if email:
101
+ emails.append(email)
102
+ else:
103
+ emails.append(str(item))
104
+ return ", ".join(emails)
105
+ if isinstance(field, dict):
106
+ return str(field.get("email", ""))
107
+ return str(field)
85
108
 
86
109
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
110
  """Extract and process email body content."""