kreuzberg 3.18.0__tar.gz → 3.20.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (362) hide show
  1. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/PKG-INFO +32 -45
  2. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_api/main.py +4 -2
  3. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_entity_extraction.py +4 -8
  4. kreuzberg-3.20.2/kreuzberg/_error_handling.py +182 -0
  5. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_base.py +2 -2
  6. kreuzberg-3.20.2/kreuzberg/_extractors/_html.py +138 -0
  7. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_pdf.py +33 -54
  8. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_structured.py +1 -1
  9. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_gmft.py +36 -4
  10. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_language_detection.py +2 -0
  11. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/_tesseract.py +76 -297
  12. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_types.py +153 -47
  13. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/cli.py +36 -22
  14. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/extraction.py +251 -107
  15. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/pyproject.toml +58 -74
  16. kreuzberg-3.18.0/.commitlintrc +0 -1
  17. kreuzberg-3.18.0/.deepsource.toml +0 -54
  18. kreuzberg-3.18.0/.docker/Dockerfile +0 -79
  19. kreuzberg-3.18.0/.docker/README.md +0 -190
  20. kreuzberg-3.18.0/.dockerignore +0 -15
  21. kreuzberg-3.18.0/.github/dependabot.yaml +0 -6
  22. kreuzberg-3.18.0/.github/workflows/ci.yaml +0 -381
  23. kreuzberg-3.18.0/.github/workflows/docker-e2e-tests.yml +0 -150
  24. kreuzberg-3.18.0/.github/workflows/docs.yml +0 -66
  25. kreuzberg-3.18.0/.github/workflows/pr-title.yaml +0 -20
  26. kreuzberg-3.18.0/.github/workflows/publish-docker.yml +0 -163
  27. kreuzberg-3.18.0/.github/workflows/release.yaml +0 -37
  28. kreuzberg-3.18.0/.github/workflows/test-docker-builds.yml +0 -101
  29. kreuzberg-3.18.0/.gitignore +0 -74
  30. kreuzberg-3.18.0/.markdownlint.yaml +0 -17
  31. kreuzberg-3.18.0/.pre-commit-config.yaml +0 -82
  32. kreuzberg-3.18.0/.prettierignore +0 -1
  33. kreuzberg-3.18.0/ATTRIBUTIONS.md +0 -47
  34. kreuzberg-3.18.0/LICENSE +0 -7
  35. kreuzberg-3.18.0/Taskfile.yml +0 -50
  36. kreuzberg-3.18.0/ai-rulez.yaml +0 -586
  37. kreuzberg-3.18.0/benchmarks/README.md +0 -264
  38. kreuzberg-3.18.0/benchmarks/batch_size_benchmark.py +0 -179
  39. kreuzberg-3.18.0/benchmarks/batch_validation_benchmark.py +0 -83
  40. kreuzberg-3.18.0/benchmarks/pyproject.toml +0 -29
  41. kreuzberg-3.18.0/benchmarks/src/__init__.py +0 -1
  42. kreuzberg-3.18.0/benchmarks/src/__main__.py +0 -4
  43. kreuzberg-3.18.0/benchmarks/src/benchmarks.py +0 -703
  44. kreuzberg-3.18.0/benchmarks/src/cli.py +0 -723
  45. kreuzberg-3.18.0/benchmarks/src/models.py +0 -195
  46. kreuzberg-3.18.0/benchmarks/src/profiler.py +0 -161
  47. kreuzberg-3.18.0/benchmarks/src/runner.py +0 -367
  48. kreuzberg-3.18.0/benchmarks/token_reduction_compression_benchmark.py +0 -268
  49. kreuzberg-3.18.0/docs/advanced/custom-extractors.md +0 -203
  50. kreuzberg-3.18.0/docs/advanced/custom-hooks.md +0 -148
  51. kreuzberg-3.18.0/docs/advanced/error-handling.md +0 -181
  52. kreuzberg-3.18.0/docs/advanced/index.md +0 -41
  53. kreuzberg-3.18.0/docs/advanced/performance.md +0 -306
  54. kreuzberg-3.18.0/docs/api-reference/exceptions.md +0 -33
  55. kreuzberg-3.18.0/docs/api-reference/extraction-functions.md +0 -59
  56. kreuzberg-3.18.0/docs/api-reference/extractor-registry.md +0 -5
  57. kreuzberg-3.18.0/docs/api-reference/index.md +0 -51
  58. kreuzberg-3.18.0/docs/api-reference/ocr-configuration.md +0 -27
  59. kreuzberg-3.18.0/docs/api-reference/types.md +0 -120
  60. kreuzberg-3.18.0/docs/assets/favicon.png +0 -0
  61. kreuzberg-3.18.0/docs/assets/logo.png +0 -0
  62. kreuzberg-3.18.0/docs/cli.md +0 -225
  63. kreuzberg-3.18.0/docs/contributing.md +0 -82
  64. kreuzberg-3.18.0/docs/css/extra.css +0 -56
  65. kreuzberg-3.18.0/docs/examples/extraction-examples.md +0 -763
  66. kreuzberg-3.18.0/docs/examples/index.md +0 -48
  67. kreuzberg-3.18.0/docs/getting-started/index.md +0 -20
  68. kreuzberg-3.18.0/docs/getting-started/installation.md +0 -154
  69. kreuzberg-3.18.0/docs/getting-started/quick-start.md +0 -111
  70. kreuzberg-3.18.0/docs/index.md +0 -60
  71. kreuzberg-3.18.0/docs/user-guide/api-server.md +0 -531
  72. kreuzberg-3.18.0/docs/user-guide/basic-usage.md +0 -161
  73. kreuzberg-3.18.0/docs/user-guide/chunking.md +0 -124
  74. kreuzberg-3.18.0/docs/user-guide/docker.md +0 -548
  75. kreuzberg-3.18.0/docs/user-guide/document-classification.md +0 -61
  76. kreuzberg-3.18.0/docs/user-guide/extraction-configuration.md +0 -966
  77. kreuzberg-3.18.0/docs/user-guide/index.md +0 -45
  78. kreuzberg-3.18.0/docs/user-guide/mcp-server.md +0 -586
  79. kreuzberg-3.18.0/docs/user-guide/metadata-extraction.md +0 -125
  80. kreuzberg-3.18.0/docs/user-guide/ocr-backends.md +0 -247
  81. kreuzberg-3.18.0/docs/user-guide/ocr-configuration.md +0 -414
  82. kreuzberg-3.18.0/docs/user-guide/supported-formats.md +0 -71
  83. kreuzberg-3.18.0/docs/user-guide/token-reduction.md +0 -251
  84. kreuzberg-3.18.0/kreuzberg/_extractors/_html.py +0 -148
  85. kreuzberg-3.18.0/kreuzberg/_utils/__init__.py +0 -0
  86. kreuzberg-3.18.0/kreuzberg/_utils/_html_streaming.py +0 -20
  87. kreuzberg-3.18.0/kreuzberg/py.typed +0 -0
  88. kreuzberg-3.18.0/mkdocs.yaml +0 -160
  89. kreuzberg-3.18.0/tests/__init__.py +0 -0
  90. kreuzberg-3.18.0/tests/api/__init__.py +0 -0
  91. kreuzberg-3.18.0/tests/api/config_cache_test.py +0 -224
  92. kreuzberg-3.18.0/tests/api/conftest.py +0 -18
  93. kreuzberg-3.18.0/tests/api/environment_config_test.py +0 -154
  94. kreuzberg-3.18.0/tests/api/header_config_hashing_test.py +0 -29
  95. kreuzberg-3.18.0/tests/api/image_extraction_test.py +0 -59
  96. kreuzberg-3.18.0/tests/api/main_test.py +0 -817
  97. kreuzberg-3.18.0/tests/api/runtime_config_test.py +0 -374
  98. kreuzberg-3.18.0/tests/conftest.py +0 -219
  99. kreuzberg-3.18.0/tests/core/__init__.py +0 -0
  100. kreuzberg-3.18.0/tests/core/comprehensive_config_test.py +0 -664
  101. kreuzberg-3.18.0/tests/core/config_test.py +0 -15
  102. kreuzberg-3.18.0/tests/core/constants_test.py +0 -22
  103. kreuzberg-3.18.0/tests/core/dpi_configuration_test.py +0 -319
  104. kreuzberg-3.18.0/tests/core/exceptions_test.py +0 -159
  105. kreuzberg-3.18.0/tests/core/extraction_batch_test.py +0 -389
  106. kreuzberg-3.18.0/tests/core/extraction_test.py +0 -494
  107. kreuzberg-3.18.0/tests/core/html_to_markdown_config_test.py +0 -0
  108. kreuzberg-3.18.0/tests/core/image_ocr_result_test.py +0 -27
  109. kreuzberg-3.18.0/tests/core/init_test.py +0 -85
  110. kreuzberg-3.18.0/tests/core/main_test.py +0 -35
  111. kreuzberg-3.18.0/tests/core/mime_types_test.py +0 -242
  112. kreuzberg-3.18.0/tests/core/registry_test.py +0 -225
  113. kreuzberg-3.18.0/tests/core/types_test.py +0 -465
  114. kreuzberg-3.18.0/tests/e2e/__init__.py +0 -0
  115. kreuzberg-3.18.0/tests/e2e/docker_e2e.py +0 -481
  116. kreuzberg-3.18.0/tests/extractors/README_image_tests.md +0 -85
  117. kreuzberg-3.18.0/tests/extractors/__init__.py +0 -0
  118. kreuzberg-3.18.0/tests/extractors/base_extractor_test.py +0 -420
  119. kreuzberg-3.18.0/tests/extractors/base_memory_limits_test.py +0 -100
  120. kreuzberg-3.18.0/tests/extractors/base_ocr_processing_test.py +0 -276
  121. kreuzberg-3.18.0/tests/extractors/base_ocr_simple_test.py +0 -64
  122. kreuzberg-3.18.0/tests/extractors/email_error_paths_test.py +0 -39
  123. kreuzberg-3.18.0/tests/extractors/email_test.py +0 -948
  124. kreuzberg-3.18.0/tests/extractors/html_invalid_base64_test.py +0 -11
  125. kreuzberg-3.18.0/tests/extractors/html_test.py +0 -52
  126. kreuzberg-3.18.0/tests/extractors/image_deduplication_test.py +0 -87
  127. kreuzberg-3.18.0/tests/extractors/image_error_handling_test.py +0 -253
  128. kreuzberg-3.18.0/tests/extractors/image_error_simple_test.py +0 -75
  129. kreuzberg-3.18.0/tests/extractors/image_test.py +0 -766
  130. kreuzberg-3.18.0/tests/extractors/json_test.py +0 -427
  131. kreuzberg-3.18.0/tests/extractors/pandoc_metadata_test.py +0 -323
  132. kreuzberg-3.18.0/tests/extractors/pandoc_test.py +0 -1995
  133. kreuzberg-3.18.0/tests/extractors/pdf_images_test.py +0 -52
  134. kreuzberg-3.18.0/tests/extractors/pdf_sync_images_test.py +0 -217
  135. kreuzberg-3.18.0/tests/extractors/pdf_test.py +0 -979
  136. kreuzberg-3.18.0/tests/extractors/presentation_test.py +0 -967
  137. kreuzberg-3.18.0/tests/extractors/spreadsheet_test.py +0 -1140
  138. kreuzberg-3.18.0/tests/extractors/structured_test.py +0 -304
  139. kreuzberg-3.18.0/tests/features/__init__.py +0 -0
  140. kreuzberg-3.18.0/tests/features/chunker_test.py +0 -94
  141. kreuzberg-3.18.0/tests/features/document_classification_test.py +0 -747
  142. kreuzberg-3.18.0/tests/features/entity_extraction_test.py +0 -279
  143. kreuzberg-3.18.0/tests/features/gmft_test.py +0 -1496
  144. kreuzberg-3.18.0/tests/features/hooks_test.py +0 -0
  145. kreuzberg-3.18.0/tests/features/language_detection_test.py +0 -343
  146. kreuzberg-3.18.0/tests/features/table_extraction_test.py +0 -0
  147. kreuzberg-3.18.0/tests/features/token_reduction_test.py +0 -813
  148. kreuzberg-3.18.0/tests/integration/__init__.py +0 -0
  149. kreuzberg-3.18.0/tests/integration/all_extractors_images_test.py +0 -252
  150. kreuzberg-3.18.0/tests/integration/api/__init__.py +0 -0
  151. kreuzberg-3.18.0/tests/integration/api/large_file_test.py +0 -0
  152. kreuzberg-3.18.0/tests/integration/api/mounted_config_test.py +0 -0
  153. kreuzberg-3.18.0/tests/integration/dpi_integration_test.py +0 -209
  154. kreuzberg-3.18.0/tests/integration/multiprocessing/__init__.py +0 -0
  155. kreuzberg-3.18.0/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
  156. kreuzberg-3.18.0/tests/integration/ocr/__init__.py +0 -0
  157. kreuzberg-3.18.0/tests/integration/ocr/device_integration_test.py +0 -0
  158. kreuzberg-3.18.0/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
  159. kreuzberg-3.18.0/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
  160. kreuzberg-3.18.0/tests/integration/pandoc_images_test.py +0 -30
  161. kreuzberg-3.18.0/tests/integration/pdf_images_test.py +0 -18
  162. kreuzberg-3.18.0/tests/integration/pdf_real_images_test.py +0 -52
  163. kreuzberg-3.18.0/tests/integration/pptx_complex_test.py +0 -22
  164. kreuzberg-3.18.0/tests/integration/pptx_images_test.py +0 -18
  165. kreuzberg-3.18.0/tests/integration/regression_test.py +0 -134
  166. kreuzberg-3.18.0/tests/integration/token_reduction_integration_test.py +0 -173
  167. kreuzberg-3.18.0/tests/interfaces/__init__.py +0 -0
  168. kreuzberg-3.18.0/tests/interfaces/cli_test.py +0 -527
  169. kreuzberg-3.18.0/tests/interfaces/mcp_server_test.py +0 -1116
  170. kreuzberg-3.18.0/tests/mcp/__init__.py +0 -0
  171. kreuzberg-3.18.0/tests/mcp/mcp_server_test.py +0 -0
  172. kreuzberg-3.18.0/tests/multiprocessing/__init__.py +0 -0
  173. kreuzberg-3.18.0/tests/multiprocessing/gmft_isolated_test.py +0 -449
  174. kreuzberg-3.18.0/tests/multiprocessing/process_manager_test.py +0 -273
  175. kreuzberg-3.18.0/tests/multiprocessing/tesseract_pool_test.py +0 -331
  176. kreuzberg-3.18.0/tests/ocr/__init__.py +0 -0
  177. kreuzberg-3.18.0/tests/ocr/base_test.py +0 -80
  178. kreuzberg-3.18.0/tests/ocr/easyocr_test.py +0 -517
  179. kreuzberg-3.18.0/tests/ocr/init_test.py +0 -35
  180. kreuzberg-3.18.0/tests/ocr/paddleocr_test.py +0 -835
  181. kreuzberg-3.18.0/tests/ocr/tesseract_test.py +0 -1314
  182. kreuzberg-3.18.0/tests/ocr/tesseract_tsv_test.py +0 -409
  183. kreuzberg-3.18.0/tests/performance/__init__.py +0 -0
  184. kreuzberg-3.18.0/tests/performance/large_pdf_perf_test.py +0 -29
  185. kreuzberg-3.18.0/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
  186. kreuzberg-3.18.0/tests/test_source_files/contract.txt +0 -1
  187. kreuzberg-3.18.0/tests/test_source_files/contract_test.txt +0 -4
  188. kreuzberg-3.18.0/tests/test_source_files/document.docx +0 -0
  189. kreuzberg-3.18.0/tests/test_source_files/email/sample-email.eml +0 -11
  190. kreuzberg-3.18.0/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
  191. kreuzberg-3.18.0/tests/test_source_files/excel.xlsx +0 -0
  192. kreuzberg-3.18.0/tests/test_source_files/flower-no-text.jpg +0 -0
  193. kreuzberg-3.18.0/tests/test_source_files/form_test.txt +0 -5
  194. kreuzberg-3.18.0/tests/test_source_files/french-text.txt +0 -2
  195. kreuzberg-3.18.0/tests/test_source_files/german-text.txt +0 -2
  196. kreuzberg-3.18.0/tests/test_source_files/google-doc-document.pdf +0 -0
  197. kreuzberg-3.18.0/tests/test_source_files/html.html +0 -10
  198. kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
  199. kreuzberg-3.18.0/tests/test_source_files/images/test_hello_world.png +0 -0
  200. kreuzberg-3.18.0/tests/test_source_files/invoice_image.png +0 -0
  201. kreuzberg-3.18.0/tests/test_source_files/invoice_test.txt +0 -4
  202. kreuzberg-3.18.0/tests/test_source_files/json/complex_nested.json +0 -41
  203. kreuzberg-3.18.0/tests/test_source_files/json/real_world/aws_policy.json +0 -43
  204. kreuzberg-3.18.0/tests/test_source_files/json/real_world/earthquakes.geojson +0 -6
  205. kreuzberg-3.18.0/tests/test_source_files/json/real_world/github_emojis.json +0 -111
  206. kreuzberg-3.18.0/tests/test_source_files/json/real_world/iss_location.json +0 -1
  207. kreuzberg-3.18.0/tests/test_source_files/json/real_world/openapi_spec.json +0 -84
  208. kreuzberg-3.18.0/tests/test_source_files/json/real_world/package.json +0 -33
  209. kreuzberg-3.18.0/tests/test_source_files/json/real_world/rick_morty_character.json +0 -1
  210. kreuzberg-3.18.0/tests/test_source_files/json/sample-document.json +0 -1
  211. kreuzberg-3.18.0/tests/test_source_files/json/schema_test.json +0 -25
  212. kreuzberg-3.18.0/tests/test_source_files/layout-parser-ocr.jpg +0 -0
  213. kreuzberg-3.18.0/tests/test_source_files/markdown.md +0 -1
  214. kreuzberg-3.18.0/tests/test_source_files/non-ascii-text.pdf +0 -0
  215. kreuzberg-3.18.0/tests/test_source_files/non-searchable.pdf +0 -0
  216. kreuzberg-3.18.0/tests/test_source_files/ocr-image.jpg +0 -0
  217. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
  218. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
  219. kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
  220. kreuzberg-3.18.0/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
  221. kreuzberg-3.18.0/tests/test_source_files/receipt_test.txt +0 -5
  222. kreuzberg-3.18.0/tests/test_source_files/report_test.txt +0 -4
  223. kreuzberg-3.18.0/tests/test_source_files/sample-contract.pdf +0 -0
  224. kreuzberg-3.18.0/tests/test_source_files/scanned.pdf +0 -0
  225. kreuzberg-3.18.0/tests/test_source_files/searchable.pdf +0 -0
  226. kreuzberg-3.18.0/tests/test_source_files/sharable-web-guide.pdf +0 -0
  227. kreuzberg-3.18.0/tests/test_source_files/spanish-text.txt +0 -2
  228. kreuzberg-3.18.0/tests/test_source_files/tables/borderless_table.png +0 -0
  229. kreuzberg-3.18.0/tests/test_source_files/tables/complex_document.png +0 -0
  230. kreuzberg-3.18.0/tests/test_source_files/tables/simple_table.png +0 -0
  231. kreuzberg-3.18.0/tests/test_source_files/test-article.pdf +0 -0
  232. kreuzberg-3.18.0/tests/test_source_files/test-excel.xls +0 -0
  233. kreuzberg-3.18.0/tests/test_source_files/yaml/sample-config.yaml +0 -15
  234. kreuzberg-3.18.0/tests/utils/__init__.py +0 -0
  235. kreuzberg-3.18.0/tests/utils/cache_test.py +0 -427
  236. kreuzberg-3.18.0/tests/utils/device_test.py +0 -347
  237. kreuzberg-3.18.0/tests/utils/errors_test.py +0 -343
  238. kreuzberg-3.18.0/tests/utils/ocr_cache_test.py +0 -286
  239. kreuzberg-3.18.0/tests/utils/pdf_lock_test.py +0 -215
  240. kreuzberg-3.18.0/tests/utils/playa_helpers_test.py +0 -0
  241. kreuzberg-3.18.0/tests/utils/playa_metadata_test.py +0 -753
  242. kreuzberg-3.18.0/tests/utils/playa_test.py +0 -315
  243. kreuzberg-3.18.0/tests/utils/process_pool_test.py +0 -223
  244. kreuzberg-3.18.0/tests/utils/quality_test.py +0 -121
  245. kreuzberg-3.18.0/tests/utils/ref_test.py +0 -90
  246. kreuzberg-3.18.0/tests/utils/serialization_test.py +0 -379
  247. kreuzberg-3.18.0/tests/utils/string_test.py +0 -251
  248. kreuzberg-3.18.0/tests/utils/sync_test.py +0 -259
  249. kreuzberg-3.18.0/tests/utils/table_test.py +0 -353
  250. kreuzberg-3.18.0/tests/utils/tmp_test.py +0 -50
  251. kreuzberg-3.18.0/uv.lock +0 -6208
  252. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/README.md +0 -0
  253. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/__init__.py +0 -0
  254. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/__main__.py +0 -0
  255. {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.2/kreuzberg/_api}/__init__.py +0 -0
  256. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_api/_config_cache.py +0 -0
  257. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_chunker.py +0 -0
  258. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_config.py +0 -0
  259. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_constants.py +0 -0
  260. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_document_classification.py +0 -0
  261. {kreuzberg-3.18.0/kreuzberg/_api → kreuzberg-3.20.2/kreuzberg/_extractors}/__init__.py +0 -0
  262. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_email.py +0 -0
  263. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_image.py +0 -0
  264. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_pandoc.py +0 -0
  265. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_presentation.py +0 -0
  266. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  267. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_mcp/__init__.py +0 -0
  268. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_mcp/server.py +0 -0
  269. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_mime_types.py +0 -0
  270. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/__init__.py +0 -0
  271. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/_base.py +0 -0
  272. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/_easyocr.py +0 -0
  273. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
  274. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_ocr/_table_extractor.py +0 -0
  275. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_playa.py +0 -0
  276. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_registry.py +0 -0
  277. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/__init__.py +0 -0
  278. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/_reducer.py +0 -0
  279. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  280. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  281. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  282. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  283. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  284. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  285. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  286. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  287. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  288. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  289. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  290. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  291. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  292. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  293. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  294. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  295. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  296. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  297. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  298. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  299. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  300. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  301. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  302. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  303. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  304. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  305. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  306. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  307. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  308. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  309. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  310. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  311. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  312. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  313. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  314. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  315. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  316. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  317. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  318. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  319. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  320. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  321. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  322. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  323. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  324. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  325. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  326. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  327. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  328. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  329. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  330. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  331. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  332. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  333. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  334. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  335. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  336. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  337. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  338. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  339. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  340. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  341. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  342. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  343. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  344. {kreuzberg-3.18.0/kreuzberg/_extractors → kreuzberg-3.20.2/kreuzberg/_utils}/__init__.py +0 -0
  345. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_cache.py +0 -0
  346. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_device.py +0 -0
  347. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_document_cache.py +0 -0
  348. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_errors.py +0 -0
  349. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  350. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_ocr_cache.py +0 -0
  351. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
  352. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_process_pool.py +0 -0
  353. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_quality.py +0 -0
  354. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_ref.py +0 -0
  355. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_resource_managers.py +0 -0
  356. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_serialization.py +0 -0
  357. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_string.py +0 -0
  358. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_sync.py +0 -0
  359. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_table.py +0 -0
  360. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/_utils/_tmp.py +0 -0
  361. {kreuzberg-3.18.0 → kreuzberg-3.20.2}/kreuzberg/exceptions.py +0 -0
  362. {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.2/kreuzberg}/py.typed +0 -0
@@ -1,13 +1,11 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: kreuzberg
3
- Version: 3.18.0
3
+ Version: 3.20.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
- Project-URL: documentation, https://kreuzberg.dev
6
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
5
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
6
+ Author: Na'aman Hirschfeld
7
7
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
8
  License: MIT
9
- License-File: LICENSE
10
- Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
9
  Classifier: Development Status :: 5 - Production/Stable
12
10
  Classifier: Intended Audience :: Developers
13
11
  Classifier: Intended Audience :: Information Technology
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
25
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
26
  Classifier: Topic :: Text Processing :: General
29
27
  Classifier: Typing :: Typed
30
- Requires-Python: >=3.10
31
28
  Requires-Dist: anyio>=4.11.0
32
29
  Requires-Dist: chardetng-py>=0.3.5
33
- Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.16.0
30
+ Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
31
+ Requires-Dist: html-to-markdown>=2.1.2
35
32
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.15.0
33
+ Requires-Dist: mcp>=1.17.0
37
34
  Requires-Dist: msgspec>=0.18.0
38
35
  Requires-Dist: numpy>=2.0.0
39
36
  Requires-Dist: playa-pdf>=0.7.0
40
- Requires-Dist: polars>=1.33.1
37
+ Requires-Dist: polars>=1.34.0
41
38
  Requires-Dist: psutil>=7.1.0
42
39
  Requires-Dist: pypdfium2==4.30.0
43
40
  Requires-Dist: python-calamine>=0.5.3
44
41
  Requires-Dist: python-pptx>=1.0.2
45
- Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
42
+ Requires-Dist: transformers>=4.55.0
43
+ Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
44
+ Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
46
+ Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
47
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
48
+ Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
49
+ Requires-Dist: click>=8.3.0 ; extra == 'cli'
50
+ Requires-Dist: rich>=14.2.0 ; extra == 'cli'
51
+ Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
52
+ Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
53
+ Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
54
+ Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
55
+ Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
56
+ Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
57
+ Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
58
+ Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
59
+ Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
60
+ Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
61
+ Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
62
+ Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
63
+ Requires-Python: >=3.10
64
+ Project-URL: documentation, https://kreuzberg.dev
65
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
46
66
  Provides-Extra: additional-extensions
47
- Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
48
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
49
67
  Provides-Extra: all
50
- Requires-Dist: click>=8.2.1; extra == 'all'
51
- Requires-Dist: deep-translator>=1.11.4; extra == 'all'
52
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
53
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
54
- Requires-Dist: gmft>=0.4.2; extra == 'all'
55
- Requires-Dist: keybert>=0.9.0; extra == 'all'
56
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
57
- Requires-Dist: mailparse>=1.0.15; extra == 'all'
58
- Requires-Dist: paddleocr>=3.2.0; extra == 'all'
59
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
60
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
61
- Requires-Dist: rich>=14.1.0; extra == 'all'
62
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
63
- Requires-Dist: setuptools>=80.9.0; extra == 'all'
64
- Requires-Dist: spacy>=3.8.7; extra == 'all'
65
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
66
68
  Provides-Extra: api
67
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
68
69
  Provides-Extra: chunking
69
- Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
70
70
  Provides-Extra: cli
71
- Requires-Dist: click>=8.2.1; extra == 'cli'
72
- Requires-Dist: rich>=14.1.0; extra == 'cli'
73
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
74
71
  Provides-Extra: crypto
75
- Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
76
72
  Provides-Extra: document-classification
77
- Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
78
73
  Provides-Extra: easyocr
79
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
80
74
  Provides-Extra: entity-extraction
81
- Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
82
- Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
83
75
  Provides-Extra: gmft
84
- Requires-Dist: gmft>=0.4.2; extra == 'gmft'
85
76
  Provides-Extra: langdetect
86
- Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
87
77
  Provides-Extra: paddleocr
88
- Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
89
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
90
- Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
91
78
  Description-Content-Type: text/markdown
92
79
 
93
80
  # Kreuzberg
@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
110
110
  Environment Variables:
111
111
  KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
112
  """
113
- default_size = 1024 * 1024 * 1024 # 1GB
113
+ default_size = 1024 * 1024 * 1024
114
114
  try:
115
115
  size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
- # Return default if negative
117
116
  return size if size >= 0 else default_size
118
117
  except ValueError:
119
118
  return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
311
310
  """
312
311
  static_config = discover_config_cached()
313
312
 
313
+ if not data:
314
+ raise ValidationError("No files provided for extraction", context={"file_count": 0})
315
+
314
316
  min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
315
317
  max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
316
318
 
@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
144
144
  try:
145
145
  nlp = spacy.load(model_name)
146
146
  except OSError:
147
- # Try to download the model automatically
147
+
148
148
  async def install_model() -> tuple[bool, str | None]:
149
149
  """Install model and return success status and error message."""
150
- # First try spaCy's built-in download
151
150
  try:
152
151
  success = await install_spacy_model_with_spacy(model_name)
153
152
  if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
157
156
  else:
158
157
  spacy_error = "spaCy download failed"
159
158
 
160
- # If spaCy download failed and uv is available, try uv as fallback
161
159
  if is_uv_available():
162
160
  try:
163
161
  result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
167
165
 
168
166
  return False, spacy_error
169
167
 
170
- # Run the async installation in a sync context
171
168
  try:
172
169
  success, error_details = anyio.run(install_model)
173
- except (OSError, RuntimeError) as e:
174
- success, error_details = False, str(e)
170
+ except SystemExit as e:
171
+ success, error_details = False, f"spaCy CLI exit code: {e.code}"
175
172
 
176
173
  if not success:
177
- # Generate appropriate error message based on available tools
178
174
  if is_uv_available():
179
175
  model_url = get_spacy_model_url(model_name)
180
176
  manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
234
230
  kw_model = KeyBERT()
235
231
  keywords = kw_model.extract_keywords(text, top_n=keyword_count)
236
232
  return [(kw, float(score)) for kw, score in keywords]
237
- except (RuntimeError, OSError, ValueError):
233
+ except ValueError:
238
234
  return []
239
235
  except ImportError as e: # pragma: no cover
240
236
  raise MissingDependencyError.create_for_package(
@@ -0,0 +1,182 @@
1
+ """Type-safe error handling utilities for extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+
11
+ from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
13
+
14
+
15
+ def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
16
+ """Determine if an exception should bubble up or be handled gracefully.
17
+
18
+ Args:
19
+ exception: The exception to classify
20
+ context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
21
+
22
+ Returns:
23
+ True if the exception should bubble up, False if it should be handled gracefully
24
+ """
25
+ if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
26
+ return True
27
+
28
+ if isinstance(exception, MissingDependencyError):
29
+ return True
30
+
31
+ if isinstance(exception, ValidationError):
32
+ if context == "batch_processing":
33
+ return False
34
+
35
+ return context != "optional_feature"
36
+
37
+ if isinstance(exception, KreuzbergError) and context == "optional_feature":
38
+ return False
39
+
40
+ if context == "batch_processing":
41
+ return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
42
+
43
+ return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
44
+
45
+
46
+ class FeatureProcessingError:
47
+ """Type-safe processing error for extraction features."""
48
+
49
+ def __init__(self, feature: str, error: Exception) -> None:
50
+ self._feature = feature
51
+ self._error = error
52
+ self._traceback = traceback.format_exc()
53
+
54
+ @property
55
+ def feature(self) -> str:
56
+ return self._feature
57
+
58
+ @property
59
+ def error_type(self) -> str:
60
+ return type(self._error).__name__
61
+
62
+ @property
63
+ def error_message(self) -> str:
64
+ return str(self._error)
65
+
66
+ @property
67
+ def traceback(self) -> str:
68
+ return self._traceback
69
+
70
+ def to_dict(self) -> ProcessingErrorDict:
71
+ return {
72
+ "feature": self.feature,
73
+ "error_type": self.error_type,
74
+ "error_message": self.error_message,
75
+ "traceback": self.traceback,
76
+ }
77
+
78
+
79
+ def safe_feature_execution(
80
+ feature_name: str,
81
+ execution_func: Callable[[], Any],
82
+ default_value: Any,
83
+ result: ExtractionResult,
84
+ context: ErrorContextType = "optional_feature",
85
+ ) -> Any:
86
+ """Safely execute a feature extraction function with proper error handling.
87
+
88
+ Args:
89
+ feature_name: Name of the feature being executed
90
+ execution_func: Function to execute that may raise exceptions
91
+ default_value: Default value to return if execution fails
92
+ result: ExtractionResult to update with error information
93
+ context: The context for exception handling decisions
94
+
95
+ Returns:
96
+ Either the successful result or the default value
97
+ """
98
+ try:
99
+ return execution_func()
100
+ except Exception as e:
101
+ if should_exception_bubble_up(e, context):
102
+ raise
103
+
104
+ _add_processing_error(result, FeatureProcessingError(feature_name, e))
105
+ return default_value
106
+
107
+
108
+ def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
109
+ """Add a processing error to the result metadata in a type-safe way."""
110
+ if result.metadata is None:
111
+ result.metadata = {}
112
+
113
+ if "processing_errors" not in result.metadata:
114
+ result.metadata["processing_errors"] = []
115
+
116
+ errors_list = result.metadata["processing_errors"]
117
+ if isinstance(errors_list, list):
118
+ errors_list.append(error.to_dict())
119
+ else:
120
+ result.metadata["processing_errors"] = [error.to_dict()]
121
+
122
+
123
+ def preserve_result_with_errors(
124
+ result: ExtractionResult,
125
+ errors: list[FeatureProcessingError],
126
+ ) -> ExtractionResult:
127
+ """Preserve a successful extraction result while adding error information.
128
+
129
+ This is used when core extraction succeeds but optional features fail.
130
+
131
+ Args:
132
+ result: The successful extraction result
133
+ errors: List of errors that occurred during optional processing
134
+
135
+ Returns:
136
+ The result with error information added to metadata
137
+ """
138
+ for error in errors:
139
+ _add_processing_error(result, error)
140
+
141
+ return result
142
+
143
+
144
+ def create_error_result(
145
+ content: str,
146
+ mime_type: str,
147
+ errors: list[FeatureProcessingError],
148
+ **metadata_kwargs: Any,
149
+ ) -> ExtractionResult:
150
+ """Create an error result with proper type safety.
151
+
152
+ Args:
153
+ content: Error content to include
154
+ mime_type: MIME type of the result
155
+ errors: List of errors that occurred
156
+ **metadata_kwargs: Additional metadata to include
157
+
158
+ Returns:
159
+ An ExtractionResult with error information
160
+ """
161
+ metadata: Metadata = {
162
+ "error": f"Multiple processing errors occurred: {len(errors)} errors",
163
+ "error_context": {
164
+ "error_count": len(errors),
165
+ "errors": [error.to_dict() for error in errors],
166
+ **metadata_kwargs,
167
+ },
168
+ "processing_errors": [error.to_dict() for error in errors],
169
+ }
170
+
171
+ return ExtractionResult(
172
+ content=content,
173
+ chunks=[],
174
+ mime_type=mime_type,
175
+ metadata=metadata,
176
+ entities=[],
177
+ keywords=[],
178
+ detected_languages=[],
179
+ tables=[],
180
+ images=[],
181
+ image_ocr_results=[],
182
+ )
@@ -230,13 +230,13 @@ class Extractor(ABC):
230
230
  confidence_score=None,
231
231
  processing_time=duration,
232
232
  )
233
- except (OSError, ValueError) as e: # pragma: no cover
233
+ except ValueError as e: # pragma: no cover
234
234
  return ImageOCRResult(
235
235
  image=target,
236
236
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
237
237
  skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
238
238
  )
239
- except (RuntimeError, TypeError) as e: # pragma: no cover
239
+ except TypeError as e: # pragma: no cover
240
240
  return ImageOCRResult(
241
241
  image=target,
242
242
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import TYPE_CHECKING, Any, ClassVar
5
+
6
+ from anyio import Path as AsyncPath
7
+ from html_to_markdown import HtmlToMarkdownError
8
+ from html_to_markdown._html_to_markdown import (
9
+ InlineImageConfig,
10
+ convert_with_inline_images,
11
+ )
12
+ from html_to_markdown._html_to_markdown import (
13
+ convert as rust_convert,
14
+ )
15
+
16
+ from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
17
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
18
+ from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
19
+ from kreuzberg._utils._string import safe_decode
20
+ from kreuzberg._utils._sync import run_maybe_async, run_sync
21
+
22
+ if TYPE_CHECKING:
23
+ from pathlib import Path
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class HTMLExtractor(Extractor):
29
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
30
+
31
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
32
+ result = await run_sync(self.extract_bytes_sync, content)
33
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
34
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
35
+ return result
36
+
37
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
38
+ content = await AsyncPath(path).read_bytes()
39
+ result = await run_sync(self.extract_bytes_sync, content)
40
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
41
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
42
+ return result
43
+
44
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
45
+ extraction_config = self.config
46
+ html_content = safe_decode(content)
47
+ if extraction_config and extraction_config.html_to_markdown_config is not None:
48
+ html_config = extraction_config.html_to_markdown_config
49
+ else:
50
+ html_config = HTMLToMarkdownConfig()
51
+ conversion_options, _ = html_config.to_options()
52
+
53
+ extract_inline_images = bool(extraction_config and extraction_config.extract_images)
54
+ run_ocr_on_images = bool(
55
+ extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
56
+ )
57
+ inline_image_config = None
58
+ if extract_inline_images:
59
+ inline_image_config = InlineImageConfig(
60
+ max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
61
+ filename_prefix=None,
62
+ capture_svg=True,
63
+ infer_dimensions=True,
64
+ )
65
+
66
+ try:
67
+ if extract_inline_images:
68
+ markdown, images_payload, warnings = convert_with_inline_images(
69
+ html_content,
70
+ options=conversion_options,
71
+ image_config=inline_image_config,
72
+ )
73
+ else:
74
+ markdown = rust_convert(
75
+ html_content,
76
+ conversion_options,
77
+ )
78
+ images_payload = []
79
+ warnings = []
80
+ except (HtmlToMarkdownError, ValueError) as exc:
81
+ logger.exception("Failed to convert HTML to Markdown: %s", exc)
82
+ markdown = ""
83
+ images_payload = []
84
+ warnings = []
85
+
86
+ for warning in warnings:
87
+ self._log_inline_warning(warning)
88
+
89
+ extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
90
+
91
+ inline_images = [self._build_extracted_image(image) for image in images_payload]
92
+ if inline_images:
93
+ extraction_result.images = inline_images
94
+ if run_ocr_on_images:
95
+ extraction_result.image_ocr_results = run_maybe_async(
96
+ self._process_images_with_ocr,
97
+ inline_images,
98
+ )
99
+
100
+ return self._apply_quality_processing(extraction_result)
101
+
102
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
103
+ content = path.read_bytes()
104
+ return self.extract_bytes_sync(content)
105
+
106
+ @staticmethod
107
+ def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
108
+ dimensions_value = image.get("dimensions")
109
+ dimensions = tuple(dimensions_value) if dimensions_value else None
110
+ return ExtractedImage(
111
+ data=image["data"],
112
+ format=image["format"],
113
+ filename=image.get("filename"),
114
+ description=image.get("description"),
115
+ dimensions=dimensions,
116
+ )
117
+
118
+ @staticmethod
119
+ def _log_inline_warning(warning: Any) -> None:
120
+ if isinstance(warning, dict):
121
+ index = warning.get("index")
122
+ message = warning.get("message")
123
+ if index is not None and message:
124
+ logger.warning("Inline image %s: %s", index, message)
125
+ elif message:
126
+ logger.warning("Inline image warning: %s", message)
127
+ else:
128
+ logger.warning("Inline image warning received with no message")
129
+ return
130
+
131
+ message = getattr(warning, "message", None)
132
+ index = getattr(warning, "index", None)
133
+ if message and index is not None:
134
+ logger.warning("Inline image %s: %s", index, message)
135
+ elif message:
136
+ logger.warning("Inline image warning: %s", message)
137
+ else:
138
+ logger.warning("Inline image warning received with no message")
@@ -6,7 +6,6 @@ import logging
6
6
  import os
7
7
  import tempfile
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from dataclasses import asdict
10
9
  from itertools import count
11
10
  from multiprocessing import cpu_count
12
11
  from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
27
26
  from kreuzberg._ocr import get_ocr_backend
28
27
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
29
28
  from kreuzberg._types import (
30
- EasyOCRConfig,
31
29
  ExtractedImage,
32
30
  ExtractionResult,
33
31
  ImageOCRResult,
34
32
  Metadata,
35
33
  OcrBackendType,
36
- PaddleOCRConfig,
37
- TesseractConfig,
38
34
  )
39
35
  from kreuzberg._utils._errors import create_error_context, should_retry
40
36
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
134
130
  def extract_path_sync(self, path: Path) -> ExtractionResult:
135
131
  content_bytes = path.read_bytes()
136
132
 
133
+ result: ExtractionResult | None = None
134
+
137
135
  document: Document | None = None
138
136
  if self.config.extract_images or self.config.extract_tables:
139
137
  document = self._parse_with_password_attempts(content_bytes)
140
138
 
141
- try:
142
- text = self._extract_pdf_searchable_text_sync(path)
143
- except ParsingError:
144
- text = ""
139
+ if not self.config.force_ocr:
140
+ try:
141
+ content = self._extract_pdf_searchable_text_sync(path)
142
+ if self._validate_extracted_text(content):
143
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
144
+ except ParsingError:
145
+ pass
145
146
 
146
- if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
147
- text = self._extract_pdf_with_ocr_sync(path)
147
+ if not result and self.config.ocr_backend is not None:
148
+ result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
149
+
150
+ if not result:
151
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
152
+
153
+ metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
154
+ result.metadata = metadata
148
155
 
149
- tables = []
150
156
  if self.config.extract_tables:
151
157
  # GMFT is optional dependency ~keep
152
158
  try:
153
159
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
154
160
 
155
161
  tables = extract_tables_sync(path)
162
+ result.tables = tables
156
163
  except ImportError: # pragma: no cover
157
- tables = []
158
-
159
- if not self.config.force_ocr and self._validate_extracted_text(text):
160
- text = self._extract_with_playa_sync(path, fallback_text=text)
161
-
162
- text = normalize_spaces(text)
163
-
164
- result = ExtractionResult(
165
- content=text,
166
- mime_type=PLAIN_TEXT_MIME_TYPE,
167
- metadata={},
168
- tables=list(tables),
169
- )
164
+ result.tables = []
170
165
 
171
- if tables:
172
- table_summary = generate_table_summary(tables)
173
- result.metadata = result.metadata | {
174
- "table_count": table_summary["table_count"],
175
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
176
- f"across {table_summary['pages_with_tables']} pages with "
177
- f"{table_summary['total_rows']} total rows",
178
- }
166
+ if result.tables:
167
+ table_summary = generate_table_summary(result.tables)
168
+ result.metadata = result.metadata | {
169
+ "table_count": table_summary["table_count"],
170
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
171
+ f"across {table_summary['pages_with_tables']} pages with "
172
+ f"{table_summary['total_rows']} total rows",
173
+ }
179
174
 
180
175
  if self.config.extract_images and document:
181
176
  images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
405
400
  except Exception as e:
406
401
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
407
402
 
408
- def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
403
+ def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
409
404
  temp_files: list[Path] = []
410
405
  try:
411
406
  with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
443
438
  with pdf_resources_sync(bitmap, page):
444
439
  pil_image.close()
445
440
 
446
- return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
441
+ content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
442
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
447
443
 
448
444
  except Exception as e:
449
445
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
452
448
  with contextlib.suppress(OSError):
453
449
  p.unlink()
454
450
 
455
- def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
456
- backend = get_ocr_backend(self.config.ocr_backend)
451
+ def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
452
+ backend = get_ocr_backend(ocr_backend)
457
453
  paths = [Path(p) for p in image_paths]
458
454
 
459
- match self.config.ocr_backend:
460
- case "tesseract":
461
- config = (
462
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
463
- )
464
- results = backend.process_batch_sync(paths, **asdict(config))
465
- case "paddleocr":
466
- paddle_config = (
467
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
468
- )
469
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
470
- case "easyocr":
471
- easy_config = (
472
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
473
- )
474
- results = backend.process_batch_sync(paths, **asdict(easy_config))
475
- case _:
476
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
455
+ results = backend.process_batch_sync(paths, **self.config.get_config_dict())
477
456
 
478
457
  return "\n\n".join(result.content for result in results)
479
458
 
@@ -14,7 +14,7 @@ else: # pragma: no cover
14
14
  try:
15
15
  import yaml
16
16
  except ImportError: # pragma: no cover
17
- yaml = None
17
+ yaml = None # type: ignore[assignment]
18
18
 
19
19
 
20
20
  from anyio import Path as AsyncPath