kreuzberg 3.13.0__tar.gz → 3.13.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/PKG-INFO +3 -2
  2. kreuzberg-3.13.2/docker-logs/docker-info.txt +60 -0
  3. kreuzberg-3.13.2/docker-logs/docker-version.txt +27 -0
  4. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_chunker.py +0 -15
  5. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_config.py +0 -124
  6. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_document_classification.py +20 -39
  7. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_entity_extraction.py +0 -29
  8. kreuzberg-3.13.2/kreuzberg/_extractors/_base.py +62 -0
  9. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_email.py +0 -4
  10. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_image.py +0 -2
  11. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_pandoc.py +0 -58
  12. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_pdf.py +0 -3
  13. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_presentation.py +0 -82
  14. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_spread_sheet.py +0 -2
  15. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_gmft.py +0 -61
  16. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_language_detection.py +0 -14
  17. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_mime_types.py +0 -17
  18. kreuzberg-3.13.2/kreuzberg/_ocr/_base.py +41 -0
  19. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_ocr/_easyocr.py +110 -85
  20. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_ocr/_paddleocr.py +146 -138
  21. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_ocr/_table_extractor.py +0 -76
  22. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_ocr/_tesseract.py +0 -206
  23. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_playa.py +0 -27
  24. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_registry.py +0 -36
  25. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_types.py +16 -119
  26. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_cache.py +0 -52
  27. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_device.py +0 -56
  28. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_document_cache.py +0 -73
  29. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_errors.py +0 -47
  30. kreuzberg-3.13.2/kreuzberg/_utils/_ocr_cache.py +136 -0
  31. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_pdf_lock.py +0 -14
  32. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_process_pool.py +0 -47
  33. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_quality.py +0 -17
  34. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_ref.py +0 -16
  35. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_serialization.py +0 -25
  36. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_string.py +0 -20
  37. kreuzberg-3.13.2/kreuzberg/_utils/_sync.py +64 -0
  38. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_table.py +0 -45
  39. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/_tmp.py +0 -9
  40. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/cli.py +2 -2
  41. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/pyproject.toml +3 -2
  42. kreuzberg-3.13.2/test_report.json +16 -0
  43. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/api/main_test.py +10 -10
  44. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/config_test.py +11 -11
  45. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/entity_extraction_test.py +1 -2
  46. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/gmft_test.py +4 -1
  47. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/language_detection_test.py +2 -1
  48. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/multiprocessing/gmft_isolated_test.py +4 -0
  49. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/multiprocessing/tesseract_pool_test.py +1 -1
  50. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/device_integration_test.py +5 -5
  51. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/easyocr_test.py +142 -105
  52. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/paddleocr_test.py +223 -280
  53. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/tesseract_test.py +105 -7
  54. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/tesseract_tsv_test.py +37 -10
  55. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/tesseract_sync_formats_test.py +4 -3
  56. kreuzberg-3.13.2/tests/utils/ocr_cache_test.py +322 -0
  57. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/uv.lock +103 -68
  58. kreuzberg-3.13.0/kreuzberg/_extractors/_base.py +0 -124
  59. kreuzberg-3.13.0/kreuzberg/_ocr/_base.py +0 -113
  60. kreuzberg-3.13.0/kreuzberg/_utils/_sync.py +0 -140
  61. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.commitlintrc +0 -0
  62. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.deepsource.toml +0 -0
  63. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.docker/Dockerfile +0 -0
  64. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.docker/README.md +0 -0
  65. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.dockerignore +0 -0
  66. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/dependabot.yaml +0 -0
  67. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/ci.yaml +0 -0
  68. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/docker-e2e-tests.yml +0 -0
  69. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/docs.yml +0 -0
  70. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/pr-title.yaml +0 -0
  71. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/publish-docker.yml +0 -0
  72. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/release.yaml +0 -0
  73. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.github/workflows/test-docker-builds.yml +0 -0
  74. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.gitignore +0 -0
  75. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.markdownlint.yaml +0 -0
  76. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/.pre-commit-config.yaml +0 -0
  77. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/LICENSE +0 -0
  78. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/README.md +0 -0
  79. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/Taskfile.yml +0 -0
  80. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/ai-rulez.yaml +0 -0
  81. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/README.md +0 -0
  82. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/__init__.py +0 -0
  83. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/pyproject.toml +0 -0
  84. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/__init__.py +0 -0
  85. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/__main__.py +0 -0
  86. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/benchmarks.py +0 -0
  87. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/cli.py +0 -0
  88. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/models.py +0 -0
  89. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/profiler.py +0 -0
  90. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/benchmarks/src/runner.py +0 -0
  91. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docker-compose.example.yml +0 -0
  92. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/advanced/custom-extractors.md +0 -0
  93. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/advanced/custom-hooks.md +0 -0
  94. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/advanced/error-handling.md +0 -0
  95. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/advanced/index.md +0 -0
  96. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/advanced/performance.md +0 -0
  97. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/exceptions.md +0 -0
  98. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/extraction-functions.md +0 -0
  99. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/extractor-registry.md +0 -0
  100. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/index.md +0 -0
  101. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/ocr-configuration.md +0 -0
  102. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/api-reference/types.md +0 -0
  103. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/assets/favicon.png +0 -0
  104. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/assets/logo.png +0 -0
  105. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/cli.md +0 -0
  106. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/contributing.md +0 -0
  107. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/css/extra.css +0 -0
  108. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/examples/extraction-examples.md +0 -0
  109. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/examples/index.md +0 -0
  110. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/getting-started/index.md +0 -0
  111. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/getting-started/installation.md +0 -0
  112. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/getting-started/quick-start.md +0 -0
  113. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/index.md +0 -0
  114. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/api-server.md +0 -0
  115. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/basic-usage.md +0 -0
  116. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/chunking.md +0 -0
  117. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/docker.md +0 -0
  118. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/document-classification.md +0 -0
  119. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/extraction-configuration.md +0 -0
  120. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/index.md +0 -0
  121. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/mcp-server.md +0 -0
  122. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/metadata-extraction.md +0 -0
  123. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/ocr-backends.md +0 -0
  124. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/ocr-configuration.md +0 -0
  125. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/docs/user-guide/supported-formats.md +0 -0
  126. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/__init__.py +0 -0
  127. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/__main__.py +0 -0
  128. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_api/__init__.py +0 -0
  129. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_api/main.py +0 -0
  130. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_constants.py +0 -0
  131. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/__init__.py +0 -0
  132. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_html.py +0 -0
  133. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_structured.py +0 -0
  134. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_mcp/__init__.py +0 -0
  135. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_mcp/server.py +0 -0
  136. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_ocr/__init__.py +0 -0
  137. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_utils/__init__.py +0 -0
  138. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/exceptions.py +0 -0
  139. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/extraction.py +0 -0
  140. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/py.typed +0 -0
  141. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/mkdocs.yaml +0 -0
  142. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/output.txt +0 -0
  143. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/results/baseline.json +0 -0
  144. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/results/serialization.json +0 -0
  145. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/results/statistical.json +0 -0
  146. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/__init__.py +0 -0
  147. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/api/__init__.py +0 -0
  148. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/api/conftest.py +0 -0
  149. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/api/runtime_config_test.py +0 -0
  150. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/chunker_test.py +0 -0
  151. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/cli_command_test.py +0 -0
  152. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/cli_integration_test.py +0 -0
  153. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/cli_test.py +0 -0
  154. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/conftest.py +0 -0
  155. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/document_classification_test.py +0 -0
  156. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/e2e/__init__.py +0 -0
  157. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/e2e/docker_e2e_test.py +0 -0
  158. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/exceptions_test.py +0 -0
  159. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extraction_batch_test.py +0 -0
  160. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extraction_test.py +0 -0
  161. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/__init__.py +0 -0
  162. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/email_test.py +0 -0
  163. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/html_test.py +0 -0
  164. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/image_test.py +0 -0
  165. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/pandoc_metadata_test.py +0 -0
  166. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/pandoc_test.py +0 -0
  167. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/pdf_test.py +0 -0
  168. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/presentation_test.py +0 -0
  169. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/spreed_sheet_test.py +0 -0
  170. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/extractors/structured_test.py +0 -0
  171. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/gmft_extended_test.py +0 -0
  172. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/hooks_test.py +0 -0
  173. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/html_to_markdown_config_test.py +0 -0
  174. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/mcp_server_test.py +0 -0
  175. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/mime_types_test.py +0 -0
  176. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/multiprocessing/__init__.py +0 -0
  177. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/multiprocessing/gmft_integration_test.py +0 -0
  178. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/multiprocessing/process_manager_test.py +0 -0
  179. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/__init__.py +0 -0
  180. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/base_test.py +0 -0
  181. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/init_test.py +0 -0
  182. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/ocr/tesseract_tsv_integration_test.py +0 -0
  183. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/playa_helpers_test.py +0 -0
  184. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/playa_test.py +0 -0
  185. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/registry_test.py +0 -0
  186. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/contract.txt +0 -0
  187. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/contract_test.txt +0 -0
  188. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/document.docx +0 -0
  189. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/email/sample-email.eml +0 -0
  190. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  191. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/excel.xlsx +0 -0
  192. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/flower-no-text.jpg +0 -0
  193. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/form_test.txt +0 -0
  194. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/french-text.txt +0 -0
  195. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/german-text.txt +0 -0
  196. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/html.html +0 -0
  197. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/images/test_hello_world.png +0 -0
  198. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/invoice_image.png +0 -0
  199. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/invoice_test.txt +0 -0
  200. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/json/sample-document.json +0 -0
  201. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  202. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/markdown.md +0 -0
  203. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/non-ascii-text.pdf +0 -0
  204. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/non-searchable.pdf +0 -0
  205. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/ocr-image.jpg +0 -0
  206. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  207. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  208. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  209. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  210. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/receipt_test.txt +0 -0
  211. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/report_test.txt +0 -0
  212. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/sample-contract.pdf +0 -0
  213. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/scanned.pdf +0 -0
  214. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/searchable.pdf +0 -0
  215. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/spanish-text.txt +0 -0
  216. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/tables/borderless_table.png +0 -0
  217. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/tables/complex_document.png +0 -0
  218. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/tables/simple_table.png +0 -0
  219. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/test-article.pdf +0 -0
  220. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/test_source_files/yaml/sample-config.yaml +0 -0
  221. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/types_test.py +0 -0
  222. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/__init__.py +0 -0
  223. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/cache_test.py +0 -0
  224. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/device_test.py +0 -0
  225. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/errors_test.py +0 -0
  226. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/pdf_lock_test.py +0 -0
  227. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/process_pool_test.py +0 -0
  228. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/ref_test.py +0 -0
  229. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/serialization_test.py +0 -0
  230. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/string_test.py +0 -0
  231. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/sync_test.py +0 -0
  232. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/table_test.py +0 -0
  233. {kreuzberg-3.13.0 → kreuzberg-3.13.2}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.13.0
3
+ Version: 3.13.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -34,11 +34,12 @@ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.1
35
35
  Requires-Dist: mcp>=1.13.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
+ Requires-Dist: numpy>=1.24.0
37
38
  Requires-Dist: playa-pdf>=0.7.0
38
39
  Requires-Dist: polars>=1.33.0
39
40
  Requires-Dist: psutil>=7.0.0
40
41
  Requires-Dist: pypdfium2==4.30.0
41
- Requires-Dist: python-calamine>=0.3.2
42
+ Requires-Dist: python-calamine>=0.5.2
42
43
  Requires-Dist: python-pptx>=1.0.2
43
44
  Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
44
45
  Provides-Extra: additional-extensions
@@ -0,0 +1,60 @@
1
+ Client: Docker Engine - Community
2
+ Version: 28.0.4
3
+ Context: default
4
+ Debug Mode: false
5
+ Plugins:
6
+ buildx: Docker Buildx (Docker Inc.)
7
+ Version: v0.27.0
8
+ Path: /usr/libexec/docker/cli-plugins/docker-buildx
9
+ compose: Docker Compose (Docker Inc.)
10
+ Version: v2.38.2
11
+ Path: /usr/libexec/docker/cli-plugins/docker-compose
12
+
13
+ Server:
14
+ Containers: 1
15
+ Running: 1
16
+ Paused: 0
17
+ Stopped: 0
18
+ Images: 2
19
+ Server Version: 28.0.4
20
+ Storage Driver: overlay2
21
+ Backing Filesystem: extfs
22
+ Supports d_type: true
23
+ Using metacopy: false
24
+ Native Overlay Diff: false
25
+ userxattr: false
26
+ Logging Driver: json-file
27
+ Cgroup Driver: systemd
28
+ Cgroup Version: 2
29
+ Plugins:
30
+ Volume: local
31
+ Network: bridge host ipvlan macvlan null overlay
32
+ Log: awslogs fluentd gcplogs gelf journald json-file local splunk syslog
33
+ Swarm: inactive
34
+ Runtimes: io.containerd.runc.v2 runc
35
+ Default Runtime: runc
36
+ Init Binary: docker-init
37
+ containerd version: 05044ec0a9a75232cad458027ca83437aae3f4da
38
+ runc version: v1.2.5-0-g59923ef
39
+ init version: de40ad0
40
+ Security Options:
41
+ apparmor
42
+ seccomp
43
+ Profile: builtin
44
+ cgroupns
45
+ Kernel Version: 6.11.0-1018-azure
46
+ Operating System: Ubuntu 24.04.3 LTS
47
+ OSType: linux
48
+ Architecture: x86_64
49
+ CPUs: 4
50
+ Total Memory: 15.62GiB
51
+ Name: pkrvm7jw40e0xgp
52
+ ID: 33a18c03-7dc8-4ab9-bfe1-99342b7c1aaf
53
+ Docker Root Dir: /var/lib/docker
54
+ Debug Mode: false
55
+ Username: githubactions
56
+ Experimental: false
57
+ Insecure Registries:
58
+ ::1/128
59
+ 127.0.0.0/8
60
+ Live Restore Enabled: false
@@ -0,0 +1,27 @@
1
+ Client: Docker Engine - Community
2
+ Version: 28.0.4
3
+ API version: 1.48
4
+ Go version: go1.23.7
5
+ Git commit: b8034c0
6
+ Built: Tue Mar 25 15:07:16 2025
7
+ OS/Arch: linux/amd64
8
+ Context: default
9
+
10
+ Server: Docker Engine - Community
11
+ Engine:
12
+ Version: 28.0.4
13
+ API version: 1.48 (minimum version 1.24)
14
+ Go version: go1.23.7
15
+ Git commit: 6430e49
16
+ Built: Tue Mar 25 15:07:16 2025
17
+ OS/Arch: linux/amd64
18
+ Experimental: false
19
+ containerd:
20
+ Version: 1.7.27
21
+ GitCommit: 05044ec0a9a75232cad458027ca83437aae3f4da
22
+ runc:
23
+ Version: 1.2.5
24
+ GitCommit: v1.2.5-0-g59923ef
25
+ docker-init:
26
+ Version: 0.19.0
27
+ GitCommit: de40ad0
@@ -17,21 +17,6 @@ def get_chunker(
17
17
  max_characters: int = DEFAULT_MAX_CHARACTERS,
18
18
  overlap_characters: int = DEFAULT_MAX_OVERLAP,
19
19
  ) -> MarkdownSplitter | TextSplitter:
20
- """Creates and returns a Chunker object configured with the given maximum
21
- characters per chunk and overlap between chunks.
22
-
23
- Args:
24
- mime_type: The mime type of the content.
25
- max_characters: Maximum number of characters allowed in each chunk.
26
- overlap_characters: Number of characters overlapping between two consecutive chunks.
27
-
28
- Raises:
29
- MissingDependencyError: if semantic-text-splitter is not installed.
30
-
31
- Returns:
32
- Chunker: A Chunker object configured with the specified maximum
33
- characters and overlap.
34
- """
35
20
  key = (max_characters, overlap_characters, mime_type)
36
21
  if key not in _chunkers:
37
22
  try:
@@ -148,17 +148,6 @@ def _create_ocr_config(
148
148
 
149
149
 
150
150
  def load_config_from_file(config_path: Path) -> dict[str, Any]:
151
- """Load configuration from a TOML file.
152
-
153
- Args:
154
- config_path: Path to the configuration file.
155
-
156
- Returns:
157
- Dictionary containing the loaded configuration.
158
-
159
- Raises:
160
- ValidationError: If the file cannot be read or parsed.
161
- """
162
151
  try:
163
152
  with config_path.open("rb") as f:
164
153
  data = tomllib.load(f)
@@ -177,15 +166,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
177
166
 
178
167
 
179
168
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
180
- """Merge two configuration dictionaries recursively.
181
-
182
- Args:
183
- base: Base configuration dictionary.
184
- override: Configuration dictionary to override base values.
185
-
186
- Returns:
187
- Merged configuration dictionary.
188
- """
189
169
  result = base.copy()
190
170
  for key, value in override.items():
191
171
  if isinstance(value, dict) and key in result and isinstance(result[key], dict):
@@ -198,18 +178,6 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
198
178
  def parse_ocr_backend_config(
199
179
  config_dict: dict[str, Any], backend: OcrBackendType
200
180
  ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
201
- """Parse OCR backend-specific configuration.
202
-
203
- Args:
204
- config_dict: Configuration dictionary.
205
- backend: The OCR backend type.
206
-
207
- Returns:
208
- Backend-specific configuration object or None.
209
-
210
- Raises:
211
- ValidationError: If the backend configuration is invalid.
212
- """
213
181
  if backend not in config_dict:
214
182
  return None
215
183
 
@@ -230,17 +198,6 @@ def parse_ocr_backend_config(
230
198
 
231
199
 
232
200
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
233
- """Build ExtractionConfig from a configuration dictionary.
234
-
235
- Args:
236
- config_dict: Configuration dictionary from TOML file.
237
-
238
- Returns:
239
- ExtractionConfig instance.
240
-
241
- Raises:
242
- ValidationError: If the configuration is invalid.
243
- """
244
201
  extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
245
202
 
246
203
  ocr_backend = extraction_config.get("ocr_backend")
@@ -288,18 +245,6 @@ def build_extraction_config(
288
245
  file_config: dict[str, Any],
289
246
  cli_args: MutableMapping[str, Any],
290
247
  ) -> ExtractionConfig:
291
- """Build ExtractionConfig from file config and CLI arguments.
292
-
293
- Args:
294
- file_config: Configuration loaded from file.
295
- cli_args: CLI arguments.
296
-
297
- Returns:
298
- ExtractionConfig instance.
299
-
300
- Raises:
301
- ValidationError: If the combined configuration is invalid.
302
- """
303
248
  config_dict: dict[str, Any] = {}
304
249
 
305
250
  _merge_file_config(config_dict, file_config)
@@ -321,21 +266,6 @@ def build_extraction_config(
321
266
 
322
267
 
323
268
  def find_config_file(start_path: Path | None = None) -> Path | None:
324
- """Find configuration file by searching up the directory tree.
325
-
326
- Searches for configuration files in the following order:
327
- 1. kreuzberg.toml
328
- 2. pyproject.toml (with [tool.kreuzberg] section)
329
-
330
- Args:
331
- start_path: Directory to start searching from. Defaults to current working directory.
332
-
333
- Returns:
334
- Path to the configuration file or None if not found.
335
-
336
- Raises:
337
- ValidationError: If a config file exists but cannot be read or has invalid TOML.
338
- """
339
269
  current = start_path or Path.cwd()
340
270
 
341
271
  while current != current.parent:
@@ -366,17 +296,6 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
366
296
 
367
297
 
368
298
  def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
369
- """Load the default configuration from discovered config file.
370
-
371
- Args:
372
- start_path: Directory to start searching from. Defaults to current working directory.
373
-
374
- Returns:
375
- ExtractionConfig instance or None if no configuration found.
376
-
377
- Raises:
378
- ValidationError: If configuration file exists but contains invalid configuration.
379
- """
380
299
  config_path = find_config_file(start_path)
381
300
  if not config_path:
382
301
  return None
@@ -388,34 +307,12 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
388
307
 
389
308
 
390
309
  def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
391
- """Load configuration from a specific file path.
392
-
393
- Args:
394
- config_path: Path to the configuration file.
395
-
396
- Returns:
397
- ExtractionConfig instance.
398
-
399
- Raises:
400
- ValidationError: If the file cannot be read, parsed, or is invalid.
401
- """
402
310
  path = Path(config_path)
403
311
  config_dict = load_config_from_file(path)
404
312
  return build_extraction_config_from_dict(config_dict)
405
313
 
406
314
 
407
315
  def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
408
- """Load configuration by discovering config files in the directory tree.
409
-
410
- Args:
411
- start_path: Directory to start searching from. Defaults to current working directory.
412
-
413
- Returns:
414
- ExtractionConfig instance.
415
-
416
- Raises:
417
- ValidationError: If no configuration file is found or if the file is invalid.
418
- """
419
316
  search_path = Path(start_path) if start_path else None
420
317
  config_path = find_config_file(search_path)
421
318
 
@@ -436,19 +333,6 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
436
333
 
437
334
 
438
335
  def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
439
- """Discover and load configuration, returning None if no config file found.
440
-
441
- If a config file is found, attempts to load it. Any errors during loading will bubble up.
442
-
443
- Args:
444
- start_path: Directory to start searching from. Defaults to current working directory.
445
-
446
- Returns:
447
- ExtractionConfig instance or None if no configuration file found.
448
-
449
- Raises:
450
- ValidationError: If a configuration file exists but is invalid.
451
- """
452
336
  search_path = Path(start_path) if start_path else None
453
337
  config_path = find_config_file(search_path)
454
338
 
@@ -462,12 +346,4 @@ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig |
462
346
 
463
347
 
464
348
  def find_default_config() -> Path | None:
465
- """Find the default configuration file (pyproject.toml).
466
-
467
- Returns:
468
- Path to the configuration file or None if not found.
469
-
470
- Note:
471
- This function is deprecated. Use find_config_file() instead.
472
- """
473
349
  return find_config_file()
@@ -3,6 +3,8 @@ from __future__ import annotations
3
3
  import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
+ import polars as pl
7
+
6
8
  from kreuzberg._ocr import get_ocr_backend
7
9
  from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
8
10
  from kreuzberg.exceptions import MissingDependencyError
@@ -40,17 +42,6 @@ DOCUMENT_CLASSIFIERS = {
40
42
 
41
43
 
42
44
  def _get_translated_text(result: ExtractionResult) -> str:
43
- """Translate extracted text to English using Google Translate API.
44
-
45
- Args:
46
- result: ExtractionResult containing the text to be translated
47
-
48
- Returns:
49
- str: The translated text in lowercase English
50
-
51
- Raises:
52
- MissingDependencyError: If the deep-translator package is not installed
53
- """
54
45
  text_to_classify = result.content
55
46
  if result.metadata:
56
47
  metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
@@ -70,16 +61,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
70
61
 
71
62
 
72
63
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
73
- """Classifies the document type based on keywords and patterns.
74
-
75
- Args:
76
- result: The extraction result containing the content.
77
- config: The extraction configuration.
78
-
79
- Returns:
80
- A tuple containing the detected document type and the confidence score,
81
- or (None, None) if no type is detected with sufficient confidence.
82
- """
83
64
  if not config.auto_detect_document_type:
84
65
  return None, None
85
66
 
@@ -108,27 +89,17 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
108
89
  def classify_document_from_layout(
109
90
  result: ExtractionResult, config: ExtractionConfig
110
91
  ) -> tuple[str | None, float | None]:
111
- """Classifies the document type based on layout information from OCR.
112
-
113
- Args:
114
- result: The extraction result containing the layout data.
115
- config: The extraction configuration.
116
-
117
- Returns:
118
- A tuple containing the detected document type and the confidence score,
119
- or (None, None) if no type is detected with sufficient confidence.
120
- """
121
92
  if not config.auto_detect_document_type:
122
93
  return None, None
123
94
 
124
- if result.layout is None or result.layout.empty:
95
+ if result.layout is None or result.layout.is_empty():
125
96
  return None, None
126
97
 
127
98
  layout_df = result.layout
128
99
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
129
100
  return None, None
130
101
 
131
- layout_text = " ".join(layout_df["text"].astype(str).tolist())
102
+ layout_text = " ".join(layout_df["text"].cast(str).to_list())
132
103
 
133
104
  text_to_classify = layout_text
134
105
  if result.metadata:
@@ -142,17 +113,27 @@ def classify_document_from_layout(
142
113
  except Exception: # noqa: BLE001
143
114
  translated_text = text_to_classify.lower()
144
115
 
145
- layout_df["translated_text"] = translated_text
116
+ layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
146
117
 
147
- page_height = layout_df["top"].max() + layout_df["height"].max()
118
+ try:
119
+ layout_df = layout_df.with_columns(
120
+ [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
121
+ )
122
+
123
+ page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
124
+ if page_height_val is None:
125
+ page_height_val = 0.0
126
+ page_height = float(page_height_val)
127
+ except Exception: # noqa: BLE001
128
+ page_height = 1000.0
148
129
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
149
130
 
150
131
  for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
151
132
  for pattern in patterns:
152
- found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
153
- if not found_words.empty:
133
+ found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
134
+ if not found_words.is_empty():
154
135
  scores[doc_type] += 1.0
155
- word_top = found_words.iloc[0]["top"]
136
+ word_top = found_words[0, "top"]
156
137
  if word_top < page_height * 0.3:
157
138
  scores[doc_type] += 0.5
158
139
 
@@ -176,7 +157,7 @@ def auto_detect_document_type(
176
157
  if config.document_classification_mode == "vision" and file_path:
177
158
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
178
159
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
179
- elif result.layout is not None and not result.layout.empty:
160
+ elif result.layout is not None and not result.layout.is_empty():
180
161
  result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
181
162
  else:
182
163
  result.document_type, result.document_type_confidence = classify_document(result, config)
@@ -19,21 +19,6 @@ def extract_entities(
19
19
  languages: list[str] | None = None,
20
20
  spacy_config: SpacyEntityExtractionConfig | None = None,
21
21
  ) -> list[Entity]:
22
- """Extract entities from text using custom regex patterns and/or a NER model.
23
-
24
- Args:
25
- text: The input text to extract entities from.
26
- entity_types: List of entity types to extract using the NER model.
27
- custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
28
- languages: List of detected languages to choose appropriate spaCy models.
29
- spacy_config: Configuration for spaCy entity extraction.
30
-
31
- Returns:
32
- list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
33
-
34
- Raises:
35
- MissingDependencyError: If `spacy` is not installed.
36
- """
37
22
  entities: list[Entity] = []
38
23
  if custom_patterns:
39
24
  for ent_type, pattern in custom_patterns:
@@ -85,7 +70,6 @@ def extract_entities(
85
70
 
86
71
  @lru_cache(maxsize=32)
87
72
  def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
88
- """Load a spaCy model with caching."""
89
73
  try:
90
74
  import spacy # noqa: PLC0415
91
75
 
@@ -102,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
102
86
 
103
87
 
104
88
  def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
105
- """Select the best spaCy model based on detected languages."""
106
89
  if not languages:
107
90
  return spacy_config.get_model_for_language("en")
108
91
 
@@ -118,18 +101,6 @@ def extract_keywords(
118
101
  text: str,
119
102
  keyword_count: int = 10,
120
103
  ) -> list[tuple[str, float]]:
121
- """Extract keywords from text using the KeyBERT model.
122
-
123
- Args:
124
- text: The input text to extract keywords from.
125
- keyword_count: Number of top keywords to return. Defaults to 10.
126
-
127
- Returns:
128
- list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
129
-
130
- Raises:
131
- MissingDependencyError: If `keybert` is not installed.
132
- """
133
104
  try:
134
105
  from keybert import KeyBERT # noqa: PLC0415
135
106
 
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, ClassVar
5
+
6
+ from kreuzberg._types import ExtractionResult, normalize_metadata
7
+ from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
8
+
9
+ if TYPE_CHECKING:
10
+ from pathlib import Path
11
+
12
+ from kreuzberg._types import ExtractionConfig
13
+
14
+
15
+ class Extractor(ABC):
16
+ __slots__ = ("config", "mime_type")
17
+
18
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]]
19
+
20
+ def __init__(self, mime_type: str, config: ExtractionConfig) -> None:
21
+ self.mime_type = mime_type
22
+ self.config = config
23
+
24
+ @abstractmethod
25
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
26
+
27
+ @abstractmethod
28
+ async def extract_path_async(self, path: Path) -> ExtractionResult: ...
29
+
30
+ @abstractmethod
31
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
32
+
33
+ @abstractmethod
34
+ def extract_path_sync(self, path: Path) -> ExtractionResult: ...
35
+
36
+ @classmethod
37
+ def supports_mimetype(cls, mime_type: str) -> bool:
38
+ return mime_type in cls.SUPPORTED_MIME_TYPES or any(
39
+ mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
40
+ )
41
+
42
+ def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
43
+ if not self.config.enable_quality_processing:
44
+ return result
45
+
46
+ if not result.content:
47
+ return result
48
+
49
+ cleaned_content = clean_extracted_text(result.content)
50
+
51
+ quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
52
+
53
+ enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
54
+
55
+ return ExtractionResult(
56
+ content=cleaned_content,
57
+ mime_type=result.mime_type,
58
+ metadata=normalize_metadata(enhanced_metadata),
59
+ chunks=result.chunks,
60
+ detected_languages=result.detected_languages,
61
+ tables=result.tables,
62
+ )
@@ -42,7 +42,6 @@ class EmailExtractor(Extractor):
42
42
  def _extract_email_headers(
43
43
  self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
44
44
  ) -> None:
45
- """Extract and process email headers."""
46
45
  subject = parsed_email.get("subject")
47
46
  if subject:
48
47
  metadata["subject"] = subject
@@ -85,7 +84,6 @@ class EmailExtractor(Extractor):
85
84
  text_parts.append(f"BCC: {bcc_formatted}")
86
85
 
87
86
  def _format_email_field(self, field: Any) -> str:
88
- """Format email field (to, cc, bcc) for display."""
89
87
  if isinstance(field, list):
90
88
  emails = []
91
89
  for item in field:
@@ -101,7 +99,6 @@ class EmailExtractor(Extractor):
101
99
  return str(field)
102
100
 
103
101
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
104
- """Extract and process email body content."""
105
102
  text_content = parsed_email.get("text")
106
103
  if text_content:
107
104
  text_parts.append(f"\n{text_content}")
@@ -123,7 +120,6 @@ class EmailExtractor(Extractor):
123
120
  def _extract_email_attachments(
124
121
  self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
125
122
  ) -> None:
126
- """Extract and process email attachments info."""
127
123
  if parsed_email.get("attachments"):
128
124
  attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
129
125
  metadata["attachments"] = attachment_names
@@ -61,7 +61,6 @@ class ImageExtractor(Extractor):
61
61
  return self._apply_quality_processing(result)
62
62
 
63
63
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
64
- """Pure sync implementation of extract_bytes."""
65
64
  extension = self._get_extension_from_mime_type(self.mime_type)
66
65
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
67
66
 
@@ -75,7 +74,6 @@ class ImageExtractor(Extractor):
75
74
  Path(temp_path).unlink()
76
75
 
77
76
  def extract_path_sync(self, path: Path) -> ExtractionResult:
78
- """Pure sync implementation of extract_path."""
79
77
  if self.config.ocr_backend is None:
80
78
  raise ValidationError("ocr_backend is None, cannot perform OCR")
81
79