kreuzberg 3.10.0__tar.gz → 3.10.1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (239)
  1. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/ci.yaml +97 -46
  2. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.pre-commit-config.yaml +1 -1
  3. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/PKG-INFO +1 -1
  4. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py +1 -1
  5. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py +1 -1
  6. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
  7. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py +1 -1
  8. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_api/main.py +1 -1
  9. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_chunker.py +1 -1
  10. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_config.py +23 -2
  11. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py +40 -5
  12. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py +2 -2
  13. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py +31 -8
  14. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py +1 -1
  15. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_structured.py +3 -3
  16. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_gmft.py +2 -2
  17. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_language_detection.py +1 -1
  18. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mcp/server.py +1 -1
  19. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_base.py +3 -3
  20. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_easyocr.py +3 -3
  21. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_paddleocr.py +2 -2
  22. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_playa.py +3 -1
  23. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_types.py +5 -5
  24. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_device.py +6 -6
  25. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_document_cache.py +1 -0
  26. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/cli.py +6 -6
  27. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/pyproject.toml +17 -2
  28. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/api/main_test.py +323 -0
  29. kreuzberg-3.10.1/tests/cli_command_test.py +523 -0
  30. kreuzberg-3.10.1/tests/config_test.py +1570 -0
  31. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/conftest.py +6 -0
  32. kreuzberg-3.10.1/tests/document_classification_test.py +886 -0
  33. kreuzberg-3.10.1/tests/entity_extraction_test.py +675 -0
  34. kreuzberg-3.10.1/tests/extraction_test.py +834 -0
  35. kreuzberg-3.10.1/tests/extractors/email_test.py +1003 -0
  36. kreuzberg-3.10.1/tests/extractors/image_test.py +768 -0
  37. kreuzberg-3.10.1/tests/extractors/pandoc_test.py +2123 -0
  38. kreuzberg-3.10.1/tests/extractors/pdf_test.py +973 -0
  39. kreuzberg-3.10.1/tests/extractors/presentation_test.py +1005 -0
  40. kreuzberg-3.10.1/tests/extractors/spreed_sheet_test.py +1237 -0
  41. kreuzberg-3.10.1/tests/extractors/structured_test.py +302 -0
  42. kreuzberg-3.10.1/tests/gmft_test.py +720 -0
  43. kreuzberg-3.10.1/tests/language_detection_test.py +172 -0
  44. kreuzberg-3.10.1/tests/mcp_server_test.py +883 -0
  45. kreuzberg-3.10.1/tests/ocr/tesseract_test.py +1141 -0
  46. kreuzberg-3.10.1/tests/playa_helpers_test.py +549 -0
  47. kreuzberg-3.10.1/tests/types_test.py +440 -0
  48. kreuzberg-3.10.1/tests/utils/string_test.py +305 -0
  49. kreuzberg-3.10.1/tests/utils_errors_test.py +299 -0
  50. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/uv.lock +154 -123
  51. kreuzberg-3.10.0/tests/config_test.py +0 -401
  52. kreuzberg-3.10.0/tests/document_classification_test.py +0 -86
  53. kreuzberg-3.10.0/tests/entity_extraction_test.py +0 -102
  54. kreuzberg-3.10.0/tests/extraction_test.py +0 -389
  55. kreuzberg-3.10.0/tests/extractors/email_comprehensive_test.py +0 -326
  56. kreuzberg-3.10.0/tests/extractors/email_test.py +0 -31
  57. kreuzberg-3.10.0/tests/extractors/image_test.py +0 -275
  58. kreuzberg-3.10.0/tests/extractors/pandoc_test.py +0 -458
  59. kreuzberg-3.10.0/tests/extractors/pdf_test.py +0 -438
  60. kreuzberg-3.10.0/tests/extractors/presentation_test.py +0 -410
  61. kreuzberg-3.10.0/tests/extractors/spreed_sheet_test.py +0 -325
  62. kreuzberg-3.10.0/tests/extractors/structured_test.py +0 -90
  63. kreuzberg-3.10.0/tests/gmft_test.py +0 -397
  64. kreuzberg-3.10.0/tests/language_detection_test.py +0 -237
  65. kreuzberg-3.10.0/tests/mcp_server_test.py +0 -382
  66. kreuzberg-3.10.0/tests/ocr/tesseract_test.py +0 -477
  67. kreuzberg-3.10.0/tests/types_test.py +0 -191
  68. kreuzberg-3.10.0/tests/utils/string_test.py +0 -85
  69. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.commitlintrc +0 -0
  70. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.deepsource.toml +0 -0
  71. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.docker/Dockerfile +0 -0
  72. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.docker/README.md +0 -0
  73. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.dockerignore +0 -0
  74. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/dependabot.yaml +0 -0
  75. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/docs.yml +0 -0
  76. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/pr-title.yaml +0 -0
  77. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/publish-docker.yml +0 -0
  78. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/release.yaml +0 -0
  79. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.gitignore +0 -0
  80. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/.markdownlint.yaml +0 -0
  81. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/LICENSE +0 -0
  82. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/README.md +0 -0
  83. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/ai-rulez.yaml +0 -0
  84. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/README.md +0 -0
  85. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/final_benchmark.py +0 -0
  86. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/pyproject.toml +0 -0
  87. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/baseline_results.json +0 -0
  88. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  89. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  90. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/final_benchmark_results.json +0 -0
  91. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/latest.json +0 -0
  92. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/mime_caching_results.json +0 -0
  93. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  94. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/ocr_caching_results.json +0 -0
  95. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  96. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  97. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/results/table_caching_results.json +0 -0
  98. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/serialization_benchmark.py +0 -0
  99. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  100. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  101. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  102. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  103. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  104. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  105. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/custom-extractors.md +0 -0
  106. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/custom-hooks.md +0 -0
  107. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/error-handling.md +0 -0
  108. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/index.md +0 -0
  109. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/advanced/performance.md +0 -0
  110. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/exceptions.md +0 -0
  111. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/extraction-functions.md +0 -0
  112. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/extractor-registry.md +0 -0
  113. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/index.md +0 -0
  114. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/ocr-configuration.md +0 -0
  115. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/api-reference/types.md +0 -0
  116. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/assets/favicon.png +0 -0
  117. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/assets/logo.png +0 -0
  118. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/changelog.md +0 -0
  119. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/cli.md +0 -0
  120. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/contributing.md +0 -0
  121. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/css/extra.css +0 -0
  122. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/examples/extraction-examples.md +0 -0
  123. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/examples/index.md +0 -0
  124. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/index.md +0 -0
  125. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/installation.md +0 -0
  126. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/getting-started/quick-start.md +0 -0
  127. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/index.md +0 -0
  128. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/performance-analysis.md +0 -0
  129. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/api-server.md +0 -0
  130. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/basic-usage.md +0 -0
  131. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/chunking.md +0 -0
  132. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/docker.md +0 -0
  133. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/document-classification.md +0 -0
  134. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/extraction-configuration.md +0 -0
  135. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/index.md +0 -0
  136. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/mcp-server.md +0 -0
  137. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/metadata-extraction.md +0 -0
  138. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/ocr-backends.md +0 -0
  139. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/ocr-configuration.md +0 -0
  140. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/docs/user-guide/supported-formats.md +0 -0
  141. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/__init__.py +0 -0
  142. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/__main__.py +0 -0
  143. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_api/__init__.py +0 -0
  144. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_constants.py +0 -0
  145. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/__init__.py +0 -0
  146. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_base.py +0 -0
  147. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_html.py +0 -0
  148. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_image.py +0 -0
  149. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  150. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_presentation.py +0 -0
  151. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  152. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mcp/__init__.py +0 -0
  153. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mime_types.py +0 -0
  154. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/__init__.py +0 -0
  155. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_tesseract.py +0 -0
  156. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_registry.py +0 -0
  157. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/__init__.py +0 -0
  158. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_cache.py +0 -0
  159. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_errors.py +0 -0
  160. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  161. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_process_pool.py +0 -0
  162. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_quality.py +0 -0
  163. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_serialization.py +0 -0
  164. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_string.py +0 -0
  165. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_sync.py +0 -0
  166. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_table.py +0 -0
  167. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_utils/_tmp.py +0 -0
  168. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/exceptions.py +0 -0
  169. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/extraction.py +0 -0
  170. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/py.typed +0 -0
  171. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/mkdocs.yaml +0 -0
  172. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/__init__.py +0 -0
  173. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/api/__init__.py +0 -0
  174. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/chunker_test.py +0 -0
  175. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/cli_integration_test.py +0 -0
  176. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/cli_test.py +0 -0
  177. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/exceptions_test.py +0 -0
  178. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extraction_batch_test.py +0 -0
  179. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/__init__.py +0 -0
  180. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/html_test.py +0 -0
  181. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  182. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/gmft_extended_test.py +0 -0
  183. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/hooks_test.py +0 -0
  184. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/mime_types_test.py +0 -0
  185. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/__init__.py +0 -0
  186. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  187. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  188. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/process_manager_test.py +0 -0
  189. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  190. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/__init__.py +0 -0
  191. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/base_test.py +0 -0
  192. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/device_integration_test.py +0 -0
  193. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/easyocr_test.py +0 -0
  194. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/init_test.py +0 -0
  195. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/ocr/paddleocr_test.py +0 -0
  196. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/playa_test.py +0 -0
  197. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/registry_test.py +0 -0
  198. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
  199. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/contract.txt +0 -0
  200. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/contract_test.txt +0 -0
  201. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/document.docx +0 -0
  202. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/email/sample-email.eml +0 -0
  203. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  204. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/excel.xlsx +0 -0
  205. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/form_test.txt +0 -0
  206. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/french-text.txt +0 -0
  207. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/german-text.txt +0 -0
  208. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/html.html +0 -0
  209. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  210. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/invoice_image.png +0 -0
  211. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/invoice_test.txt +0 -0
  212. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/json/sample-document.json +0 -0
  213. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  214. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/markdown.md +0 -0
  215. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  216. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/non-searchable.pdf +0 -0
  217. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/ocr-image.jpg +0 -0
  218. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  219. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  220. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  221. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  222. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/receipt_test.txt +0 -0
  223. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/report_test.txt +0 -0
  224. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/sample-contract.pdf +0 -0
  225. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/scanned.pdf +0 -0
  226. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/searchable.pdf +0 -0
  227. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/spanish-text.txt +0 -0
  228. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/test-article.pdf +0 -0
  229. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  230. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/__init__.py +0 -0
  231. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/cache_test.py +0 -0
  232. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/device_test.py +0 -0
  233. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/errors_test.py +0 -0
  234. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/pdf_lock_test.py +0 -0
  235. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/process_pool_test.py +0 -0
  236. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/serialization_test.py +0 -0
  237. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/sync_test.py +0 -0
  238. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/table_test.py +0 -0
  239. {kreuzberg-3.10.0 → kreuzberg-3.10.1}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
51
51
  - name: Execute Pre-Commit
52
52
  run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
53
53
 
54
+ # Coverage job runs first, only on Python 3.13 Ubuntu
55
+ coverage:
56
+ needs: validate
57
+ runs-on: ubuntu-latest
58
+ timeout-minutes: 20
59
+ steps:
60
+ - name: Checkout
61
+ uses: actions/checkout@v4
62
+
63
+ - name: Install uv
64
+ uses: astral-sh/setup-uv@v6
65
+ with:
66
+ enable-cache: true
67
+
68
+ - name: Install Python
69
+ uses: actions/setup-python@v5
70
+ id: setup-python
71
+ with:
72
+ python-version: "3.13"
73
+
74
+ - name: Cache Python Dependencies
75
+ id: python-cache
76
+ uses: actions/cache@v4
77
+ with:
78
+ path: |
79
+ ~/.cache/uv
80
+ .venv
81
+ key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
82
+ restore-keys: |
83
+ python-dependencies-ubuntu-latest-3.13-
84
+
85
+ - name: Install Dependencies
86
+ uses: nick-fields/retry@v3
87
+ with:
88
+ timeout_minutes: 5
89
+ max_attempts: 3
90
+ retry_wait_seconds: 30
91
+ command: |
92
+ uv sync --all-packages --all-extras --dev
93
+ shell: bash
94
+
95
+ - name: Install System Dependencies
96
+ uses: nick-fields/retry@v3
97
+ with:
98
+ timeout_minutes: 5
99
+ max_attempts: 3
100
+ retry_wait_seconds: 30
101
+ command: |
102
+ sudo apt-get update
103
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
104
+ shell: bash
105
+
106
+ - name: Run Tests with Coverage
107
+ uses: nick-fields/retry@v3
108
+ with:
109
+ timeout_minutes: 15
110
+ max_attempts: 3
111
+ retry_wait_seconds: 10
112
+ command: |
113
+ uv run coverage erase
114
+ uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
115
+ uv run coverage report --precision=2
116
+ shell: bash
117
+
118
+ - name: Upload Coverage to DeepSource
119
+ if: always() && github.event_name == 'push'
120
+ env:
121
+ DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
122
+ run: |
123
+ # Install DeepSource CLI
124
+ curl -fsSL https://deepsource.io/cli | sh
125
+ # Upload coverage report
126
+ ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
127
+
128
+ - name: Upload Coverage Artifacts
129
+ if: always()
130
+ uses: actions/upload-artifact@v4
131
+ with:
132
+ name: coverage-report-${{ github.sha }}
133
+ path: |
134
+ coverage.lcov
135
+ .coverage
136
+ retention-days: 7
137
+
138
+ # Full test matrix runs only after coverage succeeds
54
139
  test:
140
+ needs: coverage
141
+ runs-on: ${{ matrix.os }}
55
142
  strategy:
143
+ fail-fast: false
56
144
  matrix:
57
- os: [ ubuntu-latest, macOS-latest, windows-latest ]
58
- python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
59
- runs-on: ${{ matrix.os }}
145
+ os: [ubuntu-latest, windows-latest, macos-latest]
146
+ python: ["3.10", "3.11", "3.12", "3.13"]
147
+ exclude:
148
+ # Skip Python 3.13 on macOS for now due to compatibility issues
149
+ - os: macos-latest
150
+ python: "3.13"
60
151
  timeout-minutes: 30
61
152
  steps:
62
153
  - name: Checkout
@@ -146,52 +237,12 @@ jobs:
146
237
  pandoc --version
147
238
  shell: pwsh
148
239
 
149
- - name: Clean Coverage Data
150
- run: |
151
- rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
152
- shell: bash
153
-
154
- - name: Run Tests with Coverage
155
- run: |
156
- uv run coverage erase
157
- uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
158
-
159
- - name: Upload Coverage Artifacts
160
- if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
161
- uses: actions/upload-artifact@v4
162
- with:
163
- name: coverage-report
164
- path: coverage.lcov
165
- retention-days: 1
166
-
167
- upload-coverage:
168
- needs: test
169
- runs-on: ubuntu-latest
170
- if: github.event_name == 'push' || github.event_name == 'pull_request'
171
- steps:
172
- - name: Checkout
173
- uses: actions/checkout@v4
174
- with:
175
- ref: ${{ github.event.pull_request.head.sha || github.sha }}
176
-
177
- - name: Download Coverage Artifacts
178
- uses: actions/download-artifact@v4
179
- with:
180
- name: coverage-report
181
- path: .
182
-
183
- - name: Install DeepSource CLI
240
+ - name: Run Tests (without coverage)
184
241
  uses: nick-fields/retry@v3
185
242
  with:
186
- timeout_minutes: 3
243
+ timeout_minutes: 15
187
244
  max_attempts: 3
188
245
  retry_wait_seconds: 10
189
246
  command: |
190
- curl -fsSL https://deepsource.io/cli | sh
247
+ uv run pytest -s -vvv --reruns 2 --reruns-delay 1
191
248
  shell: bash
192
-
193
- - name: Upload Coverage to DeepSource
194
- env:
195
- DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
196
- run: |
197
- ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
@@ -53,7 +53,7 @@ repos:
53
53
  hooks:
54
54
  - id: pyproject-fmt
55
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.5
56
+ rev: v0.12.7
57
57
  hooks:
58
58
  - id: ruff
59
59
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.10.0
3
+ Version: 3.10.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
108
108
  return results # type: ignore[return-value]
109
109
 
110
110
 
111
- if __name__ == "__main__":
111
+ if __name__ == "__main__": # pragma: no cover
112
112
  baseline_results = asyncio.run(run_baseline_benchmark())
113
113
 
114
114
  baseline_file = Path("baseline_results.json")
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
195
195
  }
196
196
 
197
197
 
198
- if __name__ == "__main__":
198
+ if __name__ == "__main__": # pragma: no cover
199
199
  print("🧪 REPRODUCIBLE CACHE BENCHMARK")
200
200
  print("Testing msgpack implementation with statistical rigor...")
201
201
  print()
@@ -1,4 +1,5 @@
1
1
  """Core benchmark implementations comparing sync vs async performance."""
2
+ # mypy: disable-error-code=unused-ignore
2
3
 
3
4
  from __future__ import annotations
4
5
 
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
187
187
  }
188
188
 
189
189
 
190
- if __name__ == "__main__":
190
+ if __name__ == "__main__": # pragma: no cover
191
191
  print("🧪 STATISTICAL CACHE BENCHMARK")
192
192
  print("Testing msgpack implementation with proper error analysis...")
193
193
  print()
@@ -30,7 +30,7 @@ try:
30
30
  HTTP_422_UNPROCESSABLE_ENTITY,
31
31
  HTTP_500_INTERNAL_SERVER_ERROR,
32
32
  )
33
- except ImportError as e:
33
+ except ImportError as e: # pragma: no cover
34
34
  raise MissingDependencyError.create_for_package(
35
35
  dependency_group="litestar",
36
36
  functionality="Litestar API and docker container",
@@ -43,7 +43,7 @@ def get_chunker(
43
43
  from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
- except ImportError as e:
46
+ except ImportError as e: # pragma: no cover
47
47
  raise MissingDependencyError.create_for_package(
48
48
  dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
49
49
  ) from e
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
13
13
 
14
14
  if sys.version_info >= (3, 11):
15
15
  import tomllib
16
- else:
16
+ else: # pragma: no cover
17
17
  import tomli as tomllib # type: ignore[import-not-found]
18
18
 
19
19
  from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
50
50
  # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
51
  if config_path.name == "kreuzberg.toml":
52
52
  return data # type: ignore[no-any-return]
53
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
53
+
54
+ # For other files, check if they have [tool.kreuzberg] section
55
+ if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
56
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
+
58
+ # Otherwise assume root-level configuration
59
+ return data # type: ignore[no-any-return]
54
60
 
55
61
 
56
62
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
129
135
  "extract_keywords",
130
136
  "auto_detect_language",
131
137
  "enable_quality_processing",
138
+ "auto_detect_document_type",
139
+ "document_type_confidence_threshold",
140
+ "document_classification_mode",
141
+ "keyword_count",
132
142
  }
133
143
  extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
134
144
 
135
145
  # Handle OCR backend configuration
136
146
  ocr_backend = extraction_config.get("ocr_backend")
137
147
  if ocr_backend and ocr_backend != "none":
148
+ # Validate OCR backend
149
+ valid_backends = {"tesseract", "easyocr", "paddleocr"}
150
+ if ocr_backend not in valid_backends:
151
+ raise ValidationError(
152
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
153
+ context={"provided": ocr_backend, "valid": sorted(valid_backends)},
154
+ )
138
155
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
139
156
  if ocr_config:
140
157
  extraction_config["ocr_config"] = ocr_config
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
286
303
  "extract_keywords",
287
304
  "auto_detect_language",
288
305
  "enable_quality_processing",
306
+ "auto_detect_document_type",
307
+ "document_type_confidence_threshold",
308
+ "document_classification_mode",
309
+ "keyword_count",
289
310
  ]
290
311
 
291
312
 
@@ -4,13 +4,12 @@ import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
6
  from kreuzberg._ocr import get_ocr_backend
7
+ from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
7
8
  from kreuzberg.exceptions import MissingDependencyError
8
9
 
9
10
  if TYPE_CHECKING:
10
11
  from pathlib import Path
11
12
 
12
- from kreuzberg._types import ExtractionConfig, ExtractionResult
13
-
14
13
 
15
14
  DOCUMENT_CLASSIFIERS = {
16
15
  "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
52
51
  Raises:
53
52
  MissingDependencyError: If the deep-translator package is not installed
54
53
  """
54
+ # Combine content with metadata for classification
55
+ text_to_classify = result.content
56
+ if result.metadata:
57
+ # Add metadata values to the text for classification
58
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
+ text_to_classify = f"{text_to_classify} {metadata_text}"
60
+
55
61
  try:
56
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
57
- except ImportError as e:
63
+ except ImportError as e: # pragma: no cover
58
64
  raise MissingDependencyError(
59
65
  "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
60
66
  ) from e
61
67
 
62
- return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
68
+ try:
69
+ return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
+ except Exception: # noqa: BLE001
71
+ # Fall back to original content in lowercase if translation fails
72
+ return text_to_classify.lower()
63
73
 
64
74
 
65
75
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
73
83
  A tuple containing the detected document type and the confidence score,
74
84
  or (None, None) if no type is detected with sufficient confidence.
75
85
  """
86
+ if not config.auto_detect_document_type:
87
+ return None, None
88
+
76
89
  translated_text = _get_translated_text(result)
77
90
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
78
91
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
108
121
  A tuple containing the detected document type and the confidence score,
109
122
  or (None, None) if no type is detected with sufficient confidence.
110
123
  """
111
- translated_text = _get_translated_text(result)
124
+ if not config.auto_detect_document_type:
125
+ return None, None
112
126
 
113
127
  if result.layout is None or result.layout.empty:
114
128
  return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
117
131
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
118
132
  return None, None
119
133
 
134
+ # Use layout text for classification, not the content
135
+ layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
+
137
+ # Translate layout text directly for classification
138
+ text_to_classify = layout_text
139
+ if result.metadata:
140
+ # Add metadata values to the text for classification
141
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
+ text_to_classify = f"{text_to_classify} {metadata_text}"
143
+
144
+ try:
145
+ from deep_translator import GoogleTranslator # noqa: PLC0415
146
+
147
+ translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
+ except Exception: # noqa: BLE001
149
+ # Fall back to original content in lowercase if translation fails
150
+ translated_text = text_to_classify.lower()
151
+
120
152
  layout_df["translated_text"] = translated_text
121
153
 
122
154
  page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
151
183
  if config.document_classification_mode == "vision" and file_path:
152
184
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
153
185
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
+ elif result.layout is not None and not result.layout.empty:
187
+ # Use layout-based classification if layout data is available
188
+ result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
154
189
  else:
155
190
  result.document_type, result.document_type_confidence = classify_document(result, config)
156
191
  return result
@@ -139,7 +139,7 @@ def extract_entities(
139
139
 
140
140
  try:
141
141
  import spacy # noqa: F401, PLC0415
142
- except ImportError as e:
142
+ except ImportError as e: # pragma: no cover
143
143
  raise MissingDependencyError.create_for_package(
144
144
  package_name="spacy",
145
145
  dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
230
230
  return [(kw, float(score)) for kw, score in keywords]
231
231
  except (RuntimeError, OSError, ValueError):
232
232
  return []
233
- except ImportError as e:
233
+ except ImportError as e: # pragma: no cover
234
234
  raise MissingDependencyError.create_for_package(
235
235
  package_name="keybert",
236
236
  dependency_group="entity-extraction",
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
  # Import optional dependencies at module level with proper error handling
20
20
  try:
21
21
  import mailparse
22
- except ImportError:
22
+ except ImportError: # pragma: no cover
23
23
  mailparse = None
24
24
 
25
25
  try:
26
26
  import html2text # type: ignore[import-not-found]
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  html2text = None
29
29
 
30
30
  # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
59
59
 
60
60
  to_info = parsed_email.get("to")
61
61
  if to_info:
62
+ # Store the raw value in metadata (could be string, dict, or list)
62
63
  if isinstance(to_info, list) and to_info:
64
+ # For metadata, use first recipient's email if it's a list
63
65
  to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
66
+ metadata["email_to"] = to_email
64
67
  elif isinstance(to_info, dict):
65
- to_email = to_info.get("email", "")
68
+ metadata["email_to"] = to_info.get("email", "")
66
69
  else:
67
- to_email = str(to_info)
68
- metadata["email_to"] = to_email
69
- text_parts.append(f"To: {to_email}")
70
+ metadata["email_to"] = str(to_info)
71
+
72
+ # For display, format all recipients
73
+ to_formatted = self._format_email_field(to_info)
74
+ text_parts.append(f"To: {to_formatted}")
70
75
 
71
76
  date = parsed_email.get("date")
72
77
  if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
76
81
  cc = parsed_email.get("cc")
77
82
  if cc:
78
83
  metadata["email_cc"] = cc
79
- text_parts.append(f"CC: {cc}")
84
+ cc_formatted = self._format_email_field(cc)
85
+ text_parts.append(f"CC: {cc_formatted}")
80
86
 
81
87
  bcc = parsed_email.get("bcc")
82
88
  if bcc:
83
89
  metadata["email_bcc"] = bcc
84
- text_parts.append(f"BCC: {bcc}")
90
+ bcc_formatted = self._format_email_field(bcc)
91
+ text_parts.append(f"BCC: {bcc_formatted}")
92
+
93
+ def _format_email_field(self, field: Any) -> str:
94
+ """Format email field (to, cc, bcc) for display."""
95
+ if isinstance(field, list):
96
+ emails = []
97
+ for item in field:
98
+ if isinstance(item, dict):
99
+ email = item.get("email", "")
100
+ if email:
101
+ emails.append(email)
102
+ else:
103
+ emails.append(str(item))
104
+ return ", ".join(emails)
105
+ if isinstance(field, dict):
106
+ return str(field.get("email", ""))
107
+ return str(field)
85
108
 
86
109
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
110
  """Extract and process email body content."""
@@ -82,7 +82,7 @@ class PDFExtractor(Extractor):
82
82
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
83
83
 
84
84
  result.tables = await extract_tables(path, self.config.gmft_config)
85
- except ImportError:
85
+ except ImportError: # pragma: no cover
86
86
  result.tables = []
87
87
 
88
88
  # Enhance metadata with table information
@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
6
6
 
7
7
  if sys.version_info >= (3, 11):
8
8
  import tomllib
9
- else:
9
+ else: # pragma: no cover
10
10
  try:
11
11
  import tomli as tomllib # type: ignore[import-not-found]
12
- except ImportError:
12
+ except ImportError: # pragma: no cover
13
13
  tomllib = None
14
14
 
15
15
  try:
16
16
  import yaml
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  yaml = None
19
19
 
20
20
  from anyio import Path as AsyncPath
@@ -265,7 +265,7 @@ async def extract_tables(
265
265
  finally:
266
266
  await run_sync(doc.close)
267
267
 
268
- except ImportError as e:
268
+ except ImportError as e: # pragma: no cover
269
269
  raise MissingDependencyError.create_for_package(
270
270
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
271
271
  ) from e
@@ -379,7 +379,7 @@ def extract_tables_sync(
379
379
  finally:
380
380
  doc.close() # type: ignore[no-untyped-call]
381
381
 
382
- except ImportError as e:
382
+ except ImportError as e: # pragma: no cover
383
383
  raise MissingDependencyError.create_for_package(
384
384
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
385
385
  ) from e
@@ -14,7 +14,7 @@ try:
14
14
  from fast_langdetect import detect, detect_multilingual
15
15
 
16
16
  HAS_FAST_LANGDETECT = True
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  HAS_FAST_LANGDETECT = False
19
19
  detect = None
20
20
  detect_multilingual = None
@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
268
268
  return [TextContent(type="text", text=content)]
269
269
 
270
270
 
271
- def main() -> None:
271
+ def main() -> None: # pragma: no cover
272
272
  """Main entry point for the MCP server."""
273
273
  mcp.run()
274
274
 
@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
88
88
  Returns:
89
89
  List of extraction result objects in the same order as input paths
90
90
  """
91
- return [self.process_file_sync(path, **kwargs) for path in paths]
91
+ return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
92
92
 
93
93
  async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
94
94
  """Asynchronously process a batch of files and extract their text and metadata.
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
106
106
  from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
107
107
 
108
108
  tasks = [self.process_file(path, **kwargs) for path in paths]
109
- return await run_taskgroup(*tasks)
109
+ return await run_taskgroup(*tasks) # pragma: no cover
110
110
 
111
111
  def __hash__(self) -> int:
112
112
  """Hash function for allowing caching."""
113
- return hash(type(self).__name__)
113
+ return hash(type(self).__name__) # pragma: no cover
@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
321
321
  import torch # noqa: PLC0415
322
322
 
323
323
  return bool(torch.cuda.is_available())
324
- except ImportError:
324
+ except ImportError: # pragma: no cover
325
325
  return False
326
326
 
327
327
  @classmethod
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
340
340
 
341
341
  try:
342
342
  import easyocr # noqa: PLC0415
343
- except ImportError as e:
343
+ except ImportError as e: # pragma: no cover
344
344
  raise MissingDependencyError.create_for_package(
345
345
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
346
346
  ) from e
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
508
508
 
509
509
  try:
510
510
  import easyocr # noqa: PLC0415
511
- except ImportError as e:
511
+ except ImportError as e: # pragma: no cover
512
512
  raise MissingDependencyError.create_for_package(
513
513
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
514
514
  ) from e
@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
261
261
 
262
262
  try:
263
263
  from paddleocr import PaddleOCR # noqa: PLC0415
264
- except ImportError as e:
264
+ except ImportError as e: # pragma: no cover
265
265
  raise MissingDependencyError.create_for_package(
266
266
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
267
267
  ) from e
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
428
428
 
429
429
  try:
430
430
  from paddleocr import PaddleOCR # noqa: PLC0415
431
- except ImportError as e:
431
+ except ImportError as e: # pragma: no cover
432
432
  raise MissingDependencyError.create_for_package(
433
433
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
434
434
  ) from e
@@ -143,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
143
143
  minute = date_str[10:12]
144
144
  second = date_str[12:14]
145
145
  time_part = f"T{hour}:{minute}:{second}"
146
- return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y%m%d%H%M%S").isoformat() # noqa: DTZ007
146
+ if time_part:
147
+ return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat() # noqa: DTZ007
148
+ return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat() # noqa: DTZ007
147
149
  return date_str
148
150
 
149
151