kreuzberg 3.9.1__tar.gz → 3.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/ci.yaml +97 -46
  2. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.pre-commit-config.yaml +1 -1
  3. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/PKG-INFO +4 -1
  4. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/ai-rulez.yaml +11 -1
  5. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py +1 -1
  6. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py +1 -1
  7. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +1 -0
  8. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py +1 -1
  9. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_api/main.py +1 -1
  10. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_chunker.py +1 -1
  11. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_config.py +23 -2
  12. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py +40 -5
  13. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py +2 -2
  14. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py +31 -8
  15. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py +77 -6
  16. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_structured.py +3 -3
  17. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_gmft.py +2 -2
  18. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_language_detection.py +1 -1
  19. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mcp/server.py +1 -1
  20. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_base.py +3 -3
  21. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_easyocr.py +3 -3
  22. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_paddleocr.py +2 -2
  23. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_playa.py +9 -5
  24. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_types.py +7 -5
  25. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_device.py +6 -6
  26. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_document_cache.py +1 -0
  27. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/cli.py +6 -6
  28. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/pyproject.toml +19 -3
  29. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/api/main_test.py +323 -0
  30. kreuzberg-3.10.1/tests/cli_command_test.py +523 -0
  31. kreuzberg-3.10.1/tests/config_test.py +1570 -0
  32. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/conftest.py +6 -0
  33. kreuzberg-3.10.1/tests/document_classification_test.py +886 -0
  34. kreuzberg-3.10.1/tests/entity_extraction_test.py +675 -0
  35. kreuzberg-3.10.1/tests/extraction_test.py +834 -0
  36. kreuzberg-3.10.1/tests/extractors/email_test.py +1003 -0
  37. kreuzberg-3.10.1/tests/extractors/image_test.py +768 -0
  38. kreuzberg-3.10.1/tests/extractors/pandoc_test.py +2123 -0
  39. kreuzberg-3.10.1/tests/extractors/pdf_test.py +973 -0
  40. kreuzberg-3.10.1/tests/extractors/presentation_test.py +1005 -0
  41. kreuzberg-3.10.1/tests/extractors/spreed_sheet_test.py +1237 -0
  42. kreuzberg-3.10.1/tests/extractors/structured_test.py +302 -0
  43. kreuzberg-3.10.1/tests/gmft_test.py +720 -0
  44. kreuzberg-3.10.1/tests/language_detection_test.py +172 -0
  45. kreuzberg-3.10.1/tests/mcp_server_test.py +883 -0
  46. kreuzberg-3.10.1/tests/ocr/tesseract_test.py +1141 -0
  47. kreuzberg-3.10.1/tests/playa_helpers_test.py +549 -0
  48. kreuzberg-3.10.1/tests/types_test.py +440 -0
  49. kreuzberg-3.10.1/tests/utils/string_test.py +305 -0
  50. kreuzberg-3.10.1/tests/utils_errors_test.py +299 -0
  51. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/uv.lock +217 -129
  52. kreuzberg-3.9.1/tests/config_test.py +0 -401
  53. kreuzberg-3.9.1/tests/document_classification_test.py +0 -86
  54. kreuzberg-3.9.1/tests/entity_extraction_test.py +0 -102
  55. kreuzberg-3.9.1/tests/extraction_test.py +0 -389
  56. kreuzberg-3.9.1/tests/extractors/email_comprehensive_test.py +0 -326
  57. kreuzberg-3.9.1/tests/extractors/email_test.py +0 -31
  58. kreuzberg-3.9.1/tests/extractors/image_test.py +0 -275
  59. kreuzberg-3.9.1/tests/extractors/pandoc_test.py +0 -458
  60. kreuzberg-3.9.1/tests/extractors/pdf_test.py +0 -390
  61. kreuzberg-3.9.1/tests/extractors/presentation_test.py +0 -410
  62. kreuzberg-3.9.1/tests/extractors/spreed_sheet_test.py +0 -325
  63. kreuzberg-3.9.1/tests/extractors/structured_test.py +0 -90
  64. kreuzberg-3.9.1/tests/gmft_test.py +0 -397
  65. kreuzberg-3.9.1/tests/language_detection_test.py +0 -237
  66. kreuzberg-3.9.1/tests/mcp_server_test.py +0 -382
  67. kreuzberg-3.9.1/tests/ocr/tesseract_test.py +0 -477
  68. kreuzberg-3.9.1/tests/types_test.py +0 -191
  69. kreuzberg-3.9.1/tests/utils/string_test.py +0 -85
  70. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.commitlintrc +0 -0
  71. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.deepsource.toml +0 -0
  72. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.docker/Dockerfile +0 -0
  73. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.docker/README.md +0 -0
  74. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.dockerignore +0 -0
  75. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/dependabot.yaml +0 -0
  76. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/docs.yml +0 -0
  77. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/pr-title.yaml +0 -0
  78. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/publish-docker.yml +0 -0
  79. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/release.yaml +0 -0
  80. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.gitignore +0 -0
  81. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/.markdownlint.yaml +0 -0
  82. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/LICENSE +0 -0
  83. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/README.md +0 -0
  84. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/README.md +0 -0
  85. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/final_benchmark.py +0 -0
  86. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/pyproject.toml +0 -0
  87. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/baseline_results.json +0 -0
  88. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
  89. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
  90. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/final_benchmark_results.json +0 -0
  91. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/latest.json +0 -0
  92. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/mime_caching_results.json +0 -0
  93. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/msgspec_caching_results.json +0 -0
  94. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/ocr_caching_results.json +0 -0
  95. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
  96. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
  97. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/results/table_caching_results.json +0 -0
  98. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/serialization_benchmark.py +0 -0
  99. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
  100. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
  101. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
  102. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
  103. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
  104. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
  105. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/custom-extractors.md +0 -0
  106. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/custom-hooks.md +0 -0
  107. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/error-handling.md +0 -0
  108. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/index.md +0 -0
  109. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/advanced/performance.md +0 -0
  110. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/exceptions.md +0 -0
  111. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/extraction-functions.md +0 -0
  112. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/extractor-registry.md +0 -0
  113. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/index.md +0 -0
  114. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/ocr-configuration.md +0 -0
  115. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/api-reference/types.md +0 -0
  116. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/assets/favicon.png +0 -0
  117. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/assets/logo.png +0 -0
  118. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/changelog.md +0 -0
  119. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/cli.md +0 -0
  120. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/contributing.md +0 -0
  121. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/css/extra.css +0 -0
  122. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/examples/extraction-examples.md +0 -0
  123. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/examples/index.md +0 -0
  124. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/index.md +0 -0
  125. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/installation.md +0 -0
  126. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/getting-started/quick-start.md +0 -0
  127. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/index.md +0 -0
  128. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/performance-analysis.md +0 -0
  129. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/api-server.md +0 -0
  130. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/basic-usage.md +0 -0
  131. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/chunking.md +0 -0
  132. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/docker.md +0 -0
  133. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/document-classification.md +0 -0
  134. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/extraction-configuration.md +0 -0
  135. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/index.md +0 -0
  136. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/mcp-server.md +0 -0
  137. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/metadata-extraction.md +0 -0
  138. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/ocr-backends.md +0 -0
  139. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/ocr-configuration.md +0 -0
  140. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/docs/user-guide/supported-formats.md +0 -0
  141. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/__init__.py +0 -0
  142. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/__main__.py +0 -0
  143. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_api/__init__.py +0 -0
  144. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_constants.py +0 -0
  145. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/__init__.py +0 -0
  146. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_base.py +0 -0
  147. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_html.py +0 -0
  148. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_image.py +0 -0
  149. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pandoc.py +0 -0
  150. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_presentation.py +0 -0
  151. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  152. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mcp/__init__.py +0 -0
  153. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_mime_types.py +0 -0
  154. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/__init__.py +0 -0
  155. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_ocr/_tesseract.py +0 -0
  156. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_registry.py +0 -0
  157. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/__init__.py +0 -0
  158. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_cache.py +0 -0
  159. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_errors.py +0 -0
  160. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
  161. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_process_pool.py +0 -0
  162. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_quality.py +0 -0
  163. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_serialization.py +0 -0
  164. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_string.py +0 -0
  165. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_sync.py +0 -0
  166. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_table.py +0 -0
  167. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_utils/_tmp.py +0 -0
  168. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/exceptions.py +0 -0
  169. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/extraction.py +0 -0
  170. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/py.typed +0 -0
  171. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/mkdocs.yaml +0 -0
  172. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/__init__.py +0 -0
  173. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/api/__init__.py +0 -0
  174. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/chunker_test.py +0 -0
  175. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/cli_integration_test.py +0 -0
  176. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/cli_test.py +0 -0
  177. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/exceptions_test.py +0 -0
  178. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extraction_batch_test.py +0 -0
  179. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/__init__.py +0 -0
  180. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/html_test.py +0 -0
  181. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/extractors/pandoc_metadata_test.py +0 -0
  182. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/gmft_extended_test.py +0 -0
  183. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/hooks_test.py +0 -0
  184. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/mime_types_test.py +0 -0
  185. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/__init__.py +0 -0
  186. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
  187. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
  188. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/process_manager_test.py +0 -0
  189. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
  190. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/__init__.py +0 -0
  191. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/base_test.py +0 -0
  192. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/device_integration_test.py +0 -0
  193. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/easyocr_test.py +0 -0
  194. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/init_test.py +0 -0
  195. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/ocr/paddleocr_test.py +0 -0
  196. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/playa_test.py +0 -0
  197. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/registry_test.py +0 -0
  198. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/better-ocr-image.jpg +0 -0
  199. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/contract.txt +0 -0
  200. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/contract_test.txt +0 -0
  201. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/document.docx +0 -0
  202. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/email/sample-email.eml +0 -0
  203. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  204. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/excel.xlsx +0 -0
  205. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/form_test.txt +0 -0
  206. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/french-text.txt +0 -0
  207. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/german-text.txt +0 -0
  208. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/html.html +0 -0
  209. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/images/test_hello_world.png +0 -0
  210. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/invoice_image.png +0 -0
  211. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/invoice_test.txt +0 -0
  212. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/json/sample-document.json +0 -0
  213. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  214. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/markdown.md +0 -0
  215. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
  216. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/non-searchable.pdf +0 -0
  217. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/ocr-image.jpg +0 -0
  218. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  219. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  220. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  221. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  222. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/receipt_test.txt +0 -0
  223. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/report_test.txt +0 -0
  224. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/sample-contract.pdf +0 -0
  225. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/scanned.pdf +0 -0
  226. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/searchable.pdf +0 -0
  227. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/spanish-text.txt +0 -0
  228. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/test-article.pdf +0 -0
  229. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  230. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/__init__.py +0 -0
  231. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/cache_test.py +0 -0
  232. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/device_test.py +0 -0
  233. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/errors_test.py +0 -0
  234. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/pdf_lock_test.py +0 -0
  235. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/process_pool_test.py +0 -0
  236. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/serialization_test.py +0 -0
  237. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/sync_test.py +0 -0
  238. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/table_test.py +0 -0
  239. {kreuzberg-3.9.1 → kreuzberg-3.10.1}/tests/utils/tmp_test.py +0 -0
@@ -51,12 +51,103 @@ jobs:
51
51
  - name: Execute Pre-Commit
52
52
  run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
53
53
 
54
+ # Coverage job runs first, only on Python 3.13 Ubuntu
55
+ coverage:
56
+ needs: validate
57
+ runs-on: ubuntu-latest
58
+ timeout-minutes: 20
59
+ steps:
60
+ - name: Checkout
61
+ uses: actions/checkout@v4
62
+
63
+ - name: Install uv
64
+ uses: astral-sh/setup-uv@v6
65
+ with:
66
+ enable-cache: true
67
+
68
+ - name: Install Python
69
+ uses: actions/setup-python@v5
70
+ id: setup-python
71
+ with:
72
+ python-version: "3.13"
73
+
74
+ - name: Cache Python Dependencies
75
+ id: python-cache
76
+ uses: actions/cache@v4
77
+ with:
78
+ path: |
79
+ ~/.cache/uv
80
+ .venv
81
+ key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
82
+ restore-keys: |
83
+ python-dependencies-ubuntu-latest-3.13-
84
+
85
+ - name: Install Dependencies
86
+ uses: nick-fields/retry@v3
87
+ with:
88
+ timeout_minutes: 5
89
+ max_attempts: 3
90
+ retry_wait_seconds: 30
91
+ command: |
92
+ uv sync --all-packages --all-extras --dev
93
+ shell: bash
94
+
95
+ - name: Install System Dependencies
96
+ uses: nick-fields/retry@v3
97
+ with:
98
+ timeout_minutes: 5
99
+ max_attempts: 3
100
+ retry_wait_seconds: 30
101
+ command: |
102
+ sudo apt-get update
103
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
104
+ shell: bash
105
+
106
+ - name: Run Tests with Coverage
107
+ uses: nick-fields/retry@v3
108
+ with:
109
+ timeout_minutes: 15
110
+ max_attempts: 3
111
+ retry_wait_seconds: 10
112
+ command: |
113
+ uv run coverage erase
114
+ uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
115
+ uv run coverage report --precision=2
116
+ shell: bash
117
+
118
+ - name: Upload Coverage to DeepSource
119
+ if: always() && github.event_name == 'push'
120
+ env:
121
+ DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
122
+ run: |
123
+ # Install DeepSource CLI
124
+ curl -fsSL https://deepsource.io/cli | sh
125
+ # Upload coverage report
126
+ ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
127
+
128
+ - name: Upload Coverage Artifacts
129
+ if: always()
130
+ uses: actions/upload-artifact@v4
131
+ with:
132
+ name: coverage-report-${{ github.sha }}
133
+ path: |
134
+ coverage.lcov
135
+ .coverage
136
+ retention-days: 7
137
+
138
+ # Full test matrix runs only after coverage succeeds
54
139
  test:
140
+ needs: coverage
141
+ runs-on: ${{ matrix.os }}
55
142
  strategy:
143
+ fail-fast: false
56
144
  matrix:
57
- os: [ ubuntu-latest, macOS-latest, windows-latest ]
58
- python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
59
- runs-on: ${{ matrix.os }}
145
+ os: [ubuntu-latest, windows-latest, macos-latest]
146
+ python: ["3.10", "3.11", "3.12", "3.13"]
147
+ exclude:
148
+ # Skip Python 3.13 on macOS for now due to compatibility issues
149
+ - os: macos-latest
150
+ python: "3.13"
60
151
  timeout-minutes: 30
61
152
  steps:
62
153
  - name: Checkout
@@ -146,52 +237,12 @@ jobs:
146
237
  pandoc --version
147
238
  shell: pwsh
148
239
 
149
- - name: Clean Coverage Data
150
- run: |
151
- rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
152
- shell: bash
153
-
154
- - name: Run Tests with Coverage
155
- run: |
156
- uv run coverage erase
157
- uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
158
-
159
- - name: Upload Coverage Artifacts
160
- if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
161
- uses: actions/upload-artifact@v4
162
- with:
163
- name: coverage-report
164
- path: coverage.lcov
165
- retention-days: 1
166
-
167
- upload-coverage:
168
- needs: test
169
- runs-on: ubuntu-latest
170
- if: github.event_name == 'push' || github.event_name == 'pull_request'
171
- steps:
172
- - name: Checkout
173
- uses: actions/checkout@v4
174
- with:
175
- ref: ${{ github.event.pull_request.head.sha || github.sha }}
176
-
177
- - name: Download Coverage Artifacts
178
- uses: actions/download-artifact@v4
179
- with:
180
- name: coverage-report
181
- path: .
182
-
183
- - name: Install DeepSource CLI
240
+ - name: Run Tests (without coverage)
184
241
  uses: nick-fields/retry@v3
185
242
  with:
186
- timeout_minutes: 3
243
+ timeout_minutes: 15
187
244
  max_attempts: 3
188
245
  retry_wait_seconds: 10
189
246
  command: |
190
- curl -fsSL https://deepsource.io/cli | sh
247
+ uv run pytest -s -vvv --reruns 2 --reruns-delay 1
191
248
  shell: bash
192
-
193
- - name: Upload Coverage to DeepSource
194
- env:
195
- DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
196
- run: |
197
- ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
@@ -53,7 +53,7 @@ repos:
53
53
  hooks:
54
54
  - id: pyproject-fmt
55
55
  - repo: https://github.com/astral-sh/ruff-pre-commit
56
- rev: v0.12.5
56
+ rev: v0.12.7
57
57
  hooks:
58
58
  - id: ruff
59
59
  args: ["--fix", "--unsafe-fixes"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.9.1
3
+ Version: 3.10.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
53
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
54
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
55
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
56
57
  Requires-Dist: rich>=14.1.0; extra == 'all'
57
58
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
59
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
@@ -69,6 +70,8 @@ Provides-Extra: cli
69
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
70
71
  Requires-Dist: rich>=14.1.0; extra == 'cli'
71
72
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
+ Provides-Extra: crypto
74
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
72
75
  Provides-Extra: easyocr
73
76
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
74
77
  Provides-Extra: entity-extraction
@@ -193,16 +193,18 @@ rules:
193
193
  api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
194
194
  cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
195
195
  chunking = ["semantic-text-splitter>=0.27.0"]
196
+ crypto = ["playa-pdf[crypto]>=0.6.4"]
196
197
  easyocr = ["easyocr>=1.7.2"]
197
198
  gmft = ["gmft>=0.4.2"]
198
199
  langdetect = ["fast-langdetect>=0.2.0"]
199
200
  paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
200
- all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
201
+ all = ["kreuzberg[api,chunking,cli,crypto,easyocr,gmft,langdetect,paddleocr]"]
201
202
  ```
202
203
 
203
204
  ### Installation Patterns
204
205
  - Basic: `pip install kreuzberg`
205
206
  - With features: `pip install "kreuzberg[api,cli]"`
207
+ - With crypto support: `pip install "kreuzberg[crypto]"`
206
208
  - All features: `pip install "kreuzberg[all]"`
207
209
  - Development: `uv sync --all-extras`
208
210
 
@@ -211,6 +213,14 @@ rules:
211
213
  - **System**: tesseract-ocr, pandoc (via package manager)
212
214
  - **Development**: Uses dependency groups in pyproject.toml
213
215
 
216
+ ### Crypto Support
217
+ The `crypto` extra adds cryptographic support for PDF processing:
218
+ - **Purpose**: Enables AES encryption/decryption for password-protected PDFs
219
+ - **Dependencies**: Adds cryptography (~22MB), cffi, and pycparser
220
+ - **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
221
+ - **Password Support**: Supports single password or list of passwords to try in sequence
222
+ - **Size Impact**: Increases installation size by ~24MB due to cryptography package
223
+
214
224
  sections:
215
225
  - title: "Language Detection"
216
226
  content: |
@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
108
108
  return results # type: ignore[return-value]
109
109
 
110
110
 
111
- if __name__ == "__main__":
111
+ if __name__ == "__main__": # pragma: no cover
112
112
  baseline_results = asyncio.run(run_baseline_benchmark())
113
113
 
114
114
  baseline_file = Path("baseline_results.json")
@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
195
195
  }
196
196
 
197
197
 
198
- if __name__ == "__main__":
198
+ if __name__ == "__main__": # pragma: no cover
199
199
  print("🧪 REPRODUCIBLE CACHE BENCHMARK")
200
200
  print("Testing msgpack implementation with statistical rigor...")
201
201
  print()
@@ -1,4 +1,5 @@
1
1
  """Core benchmark implementations comparing sync vs async performance."""
2
+ # mypy: disable-error-code=unused-ignore
2
3
 
3
4
  from __future__ import annotations
4
5
 
@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
187
187
  }
188
188
 
189
189
 
190
- if __name__ == "__main__":
190
+ if __name__ == "__main__": # pragma: no cover
191
191
  print("🧪 STATISTICAL CACHE BENCHMARK")
192
192
  print("Testing msgpack implementation with proper error analysis...")
193
193
  print()
@@ -30,7 +30,7 @@ try:
30
30
  HTTP_422_UNPROCESSABLE_ENTITY,
31
31
  HTTP_500_INTERNAL_SERVER_ERROR,
32
32
  )
33
- except ImportError as e:
33
+ except ImportError as e: # pragma: no cover
34
34
  raise MissingDependencyError.create_for_package(
35
35
  dependency_group="litestar",
36
36
  functionality="Litestar API and docker container",
@@ -43,7 +43,7 @@ def get_chunker(
43
43
  from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
- except ImportError as e:
46
+ except ImportError as e: # pragma: no cover
47
47
  raise MissingDependencyError.create_for_package(
48
48
  dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
49
49
  ) from e
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
13
13
 
14
14
  if sys.version_info >= (3, 11):
15
15
  import tomllib
16
- else:
16
+ else: # pragma: no cover
17
17
  import tomli as tomllib # type: ignore[import-not-found]
18
18
 
19
19
  from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
50
50
  # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
51
  if config_path.name == "kreuzberg.toml":
52
52
  return data # type: ignore[no-any-return]
53
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
53
+
54
+ # For other files, check if they have [tool.kreuzberg] section
55
+ if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
56
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
+
58
+ # Otherwise assume root-level configuration
59
+ return data # type: ignore[no-any-return]
54
60
 
55
61
 
56
62
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
129
135
  "extract_keywords",
130
136
  "auto_detect_language",
131
137
  "enable_quality_processing",
138
+ "auto_detect_document_type",
139
+ "document_type_confidence_threshold",
140
+ "document_classification_mode",
141
+ "keyword_count",
132
142
  }
133
143
  extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
134
144
 
135
145
  # Handle OCR backend configuration
136
146
  ocr_backend = extraction_config.get("ocr_backend")
137
147
  if ocr_backend and ocr_backend != "none":
148
+ # Validate OCR backend
149
+ valid_backends = {"tesseract", "easyocr", "paddleocr"}
150
+ if ocr_backend not in valid_backends:
151
+ raise ValidationError(
152
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
153
+ context={"provided": ocr_backend, "valid": sorted(valid_backends)},
154
+ )
138
155
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
139
156
  if ocr_config:
140
157
  extraction_config["ocr_config"] = ocr_config
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
286
303
  "extract_keywords",
287
304
  "auto_detect_language",
288
305
  "enable_quality_processing",
306
+ "auto_detect_document_type",
307
+ "document_type_confidence_threshold",
308
+ "document_classification_mode",
309
+ "keyword_count",
289
310
  ]
290
311
 
291
312
 
@@ -4,13 +4,12 @@ import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
6
  from kreuzberg._ocr import get_ocr_backend
7
+ from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
7
8
  from kreuzberg.exceptions import MissingDependencyError
8
9
 
9
10
  if TYPE_CHECKING:
10
11
  from pathlib import Path
11
12
 
12
- from kreuzberg._types import ExtractionConfig, ExtractionResult
13
-
14
13
 
15
14
  DOCUMENT_CLASSIFIERS = {
16
15
  "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
52
51
  Raises:
53
52
  MissingDependencyError: If the deep-translator package is not installed
54
53
  """
54
+ # Combine content with metadata for classification
55
+ text_to_classify = result.content
56
+ if result.metadata:
57
+ # Add metadata values to the text for classification
58
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
+ text_to_classify = f"{text_to_classify} {metadata_text}"
60
+
55
61
  try:
56
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
57
- except ImportError as e:
63
+ except ImportError as e: # pragma: no cover
58
64
  raise MissingDependencyError(
59
65
  "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
60
66
  ) from e
61
67
 
62
- return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
68
+ try:
69
+ return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
+ except Exception: # noqa: BLE001
71
+ # Fall back to original content in lowercase if translation fails
72
+ return text_to_classify.lower()
63
73
 
64
74
 
65
75
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
73
83
  A tuple containing the detected document type and the confidence score,
74
84
  or (None, None) if no type is detected with sufficient confidence.
75
85
  """
86
+ if not config.auto_detect_document_type:
87
+ return None, None
88
+
76
89
  translated_text = _get_translated_text(result)
77
90
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
78
91
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
108
121
  A tuple containing the detected document type and the confidence score,
109
122
  or (None, None) if no type is detected with sufficient confidence.
110
123
  """
111
- translated_text = _get_translated_text(result)
124
+ if not config.auto_detect_document_type:
125
+ return None, None
112
126
 
113
127
  if result.layout is None or result.layout.empty:
114
128
  return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
117
131
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
118
132
  return None, None
119
133
 
134
+ # Use layout text for classification, not the content
135
+ layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
+
137
+ # Translate layout text directly for classification
138
+ text_to_classify = layout_text
139
+ if result.metadata:
140
+ # Add metadata values to the text for classification
141
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
+ text_to_classify = f"{text_to_classify} {metadata_text}"
143
+
144
+ try:
145
+ from deep_translator import GoogleTranslator # noqa: PLC0415
146
+
147
+ translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
+ except Exception: # noqa: BLE001
149
+ # Fall back to original content in lowercase if translation fails
150
+ translated_text = text_to_classify.lower()
151
+
120
152
  layout_df["translated_text"] = translated_text
121
153
 
122
154
  page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
151
183
  if config.document_classification_mode == "vision" and file_path:
152
184
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
153
185
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
+ elif result.layout is not None and not result.layout.empty:
187
+ # Use layout-based classification if layout data is available
188
+ result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
154
189
  else:
155
190
  result.document_type, result.document_type_confidence = classify_document(result, config)
156
191
  return result
@@ -139,7 +139,7 @@ def extract_entities(
139
139
 
140
140
  try:
141
141
  import spacy # noqa: F401, PLC0415
142
- except ImportError as e:
142
+ except ImportError as e: # pragma: no cover
143
143
  raise MissingDependencyError.create_for_package(
144
144
  package_name="spacy",
145
145
  dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
230
230
  return [(kw, float(score)) for kw, score in keywords]
231
231
  except (RuntimeError, OSError, ValueError):
232
232
  return []
233
- except ImportError as e:
233
+ except ImportError as e: # pragma: no cover
234
234
  raise MissingDependencyError.create_for_package(
235
235
  package_name="keybert",
236
236
  dependency_group="entity-extraction",
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
  # Import optional dependencies at module level with proper error handling
20
20
  try:
21
21
  import mailparse
22
- except ImportError:
22
+ except ImportError: # pragma: no cover
23
23
  mailparse = None
24
24
 
25
25
  try:
26
26
  import html2text # type: ignore[import-not-found]
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  html2text = None
29
29
 
30
30
  # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
59
59
 
60
60
  to_info = parsed_email.get("to")
61
61
  if to_info:
62
+ # Store the raw value in metadata (could be string, dict, or list)
62
63
  if isinstance(to_info, list) and to_info:
64
+ # For metadata, use first recipient's email if it's a list
63
65
  to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
66
+ metadata["email_to"] = to_email
64
67
  elif isinstance(to_info, dict):
65
- to_email = to_info.get("email", "")
68
+ metadata["email_to"] = to_info.get("email", "")
66
69
  else:
67
- to_email = str(to_info)
68
- metadata["email_to"] = to_email
69
- text_parts.append(f"To: {to_email}")
70
+ metadata["email_to"] = str(to_info)
71
+
72
+ # For display, format all recipients
73
+ to_formatted = self._format_email_field(to_info)
74
+ text_parts.append(f"To: {to_formatted}")
70
75
 
71
76
  date = parsed_email.get("date")
72
77
  if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
76
81
  cc = parsed_email.get("cc")
77
82
  if cc:
78
83
  metadata["email_cc"] = cc
79
- text_parts.append(f"CC: {cc}")
84
+ cc_formatted = self._format_email_field(cc)
85
+ text_parts.append(f"CC: {cc_formatted}")
80
86
 
81
87
  bcc = parsed_email.get("bcc")
82
88
  if bcc:
83
89
  metadata["email_bcc"] = bcc
84
- text_parts.append(f"BCC: {bcc}")
90
+ bcc_formatted = self._format_email_field(bcc)
91
+ text_parts.append(f"BCC: {bcc_formatted}")
92
+
93
+ def _format_email_field(self, field: Any) -> str:
94
+ """Format email field (to, cc, bcc) for display."""
95
+ if isinstance(field, list):
96
+ emails = []
97
+ for item in field:
98
+ if isinstance(item, dict):
99
+ email = item.get("email", "")
100
+ if email:
101
+ emails.append(email)
102
+ else:
103
+ emails.append(str(item))
104
+ return ", ".join(emails)
105
+ if isinstance(field, dict):
106
+ return str(field.get("email", ""))
107
+ return str(field)
85
108
 
86
109
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
110
  """Extract and process email body content."""
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
22
22
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
23
  from kreuzberg._ocr._tesseract import TesseractConfig
24
24
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
25
- from kreuzberg._types import ExtractionResult, OcrBackendType
25
+ from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
26
26
  from kreuzberg._utils._errors import create_error_context, should_retry
27
27
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
28
28
  from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
33
33
 
34
34
  if TYPE_CHECKING: # pragma: no cover
35
35
  from PIL.Image import Image
36
+ from playa.document import Document
36
37
 
37
38
 
38
39
  class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
45
46
  file_path, unlink = await create_temp_file(".pdf")
46
47
  await AsyncPath(file_path).write_bytes(content)
47
48
  try:
48
- metadata = await extract_pdf_metadata(content)
49
+ metadata = await self._extract_metadata_with_password_attempts(content)
49
50
  result = await self.extract_path_async(file_path)
50
51
 
51
52
  result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
73
74
  if not result:
74
75
  result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
75
76
 
76
- result.metadata = await extract_pdf_metadata(content_bytes)
77
+ result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
77
78
 
78
79
  if self.config.extract_tables:
79
80
  # GMFT is optional dependency
@@ -81,7 +82,7 @@ class PDFExtractor(Extractor):
81
82
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
82
83
 
83
84
  result.tables = await extract_tables(path, self.config.gmft_config)
84
- except ImportError:
85
+ except ImportError: # pragma: no cover
85
86
  result.tables = []
86
87
 
87
88
  # Enhance metadata with table information
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
107
108
 
108
109
  result = self.extract_path_sync(Path(temp_path))
109
110
 
110
- metadata = extract_pdf_metadata_sync(content)
111
+ metadata = self._extract_metadata_with_password_attempts_sync(content)
111
112
  result.metadata = metadata
112
113
 
113
114
  return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
406
407
  # Use list comprehension and join for efficient string building
407
408
  return "\n\n".join(result.content for result in results)
408
409
 
410
+ def _parse_with_password_attempts(self, content: bytes) -> Document:
411
+ """Parse PDF with password attempts."""
412
+ # Normalize password to list
413
+ if isinstance(self.config.pdf_password, str):
414
+ passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
415
+ else:
416
+ passwords = list(self.config.pdf_password)
417
+
418
+ # Try each password in sequence
419
+ last_exception = None
420
+ for password in passwords:
421
+ try:
422
+ return parse(content, max_workers=1, password=password)
423
+ except Exception as e: # noqa: PERF203, BLE001
424
+ last_exception = e
425
+ continue
426
+
427
+ # If all passwords failed, raise the last exception
428
+ if last_exception:
429
+ raise last_exception from None
430
+
431
+ # Fallback to no password
432
+ return parse(content, max_workers=1, password="")
433
+
434
+ def _get_passwords_to_try(self) -> list[str]:
435
+ """Get list of passwords to try in sequence."""
436
+ if isinstance(self.config.pdf_password, str):
437
+ return [self.config.pdf_password] if self.config.pdf_password else [""]
438
+ return list(self.config.pdf_password) if self.config.pdf_password else [""]
439
+
440
+ async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
441
+ """Extract PDF metadata with password attempts."""
442
+ passwords = self._get_passwords_to_try()
443
+
444
+ last_exception = None
445
+ for password in passwords:
446
+ try:
447
+ return await extract_pdf_metadata(content, password=password)
448
+ except Exception as e: # noqa: PERF203, BLE001
449
+ last_exception = e
450
+ continue
451
+
452
+ # If all passwords failed, try with empty password as fallback
453
+ try:
454
+ return await extract_pdf_metadata(content, password="")
455
+ except Exception:
456
+ if last_exception:
457
+ raise last_exception from None
458
+ raise
459
+
460
+ def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
461
+ """Extract PDF metadata with password attempts (sync version)."""
462
+ passwords = self._get_passwords_to_try()
463
+
464
+ last_exception = None
465
+ for password in passwords:
466
+ try:
467
+ return extract_pdf_metadata_sync(content, password=password)
468
+ except Exception as e: # noqa: PERF203, BLE001
469
+ last_exception = e
470
+ continue
471
+
472
+ # If all passwords failed, try with empty password as fallback
473
+ try:
474
+ return extract_pdf_metadata_sync(content, password="")
475
+ except Exception:
476
+ if last_exception:
477
+ raise last_exception from None
478
+ raise
479
+
409
480
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
410
481
  """Extract text using playa for better structure preservation."""
411
482
  with contextlib.suppress(Exception):
412
483
  content = path.read_bytes()
413
- document = parse(content, max_workers=1)
484
+ document = self._parse_with_password_attempts(content)
414
485
 
415
486
  # Extract text while preserving structure
416
487
  pages_text = []